From e1073ee20e0e0ceb5ad787878e7aea625b63a707 Mon Sep 17 00:00:00 2001
From: Andrew Kerr <akerr@nvidia.com>
Date: Sat, 23 Apr 2022 15:02:38 -0400
Subject: [PATCH] CUTLASS 2.9 (#468)

---
 .github/ISSUE_TEMPLATE/bug_report.md          |   23 -
 .../ISSUE_TEMPLATE/documentation_request.md   |   35 -
 .github/ISSUE_TEMPLATE/feature_request.md     |   20 -
 .github/ISSUE_TEMPLATE/submit_question.md     |   10 -
 .github/workflows/labeler.yml                 |   11 -
 .../new-issues-to-triage-projects.yml         |   35 -
 .github/workflows/stale.yml                   |   57 -
 CHANGELOG.md                                  |   66 +-
 CMakeLists.txt                                |   52 +-
 CUDA.cmake                                    |   45 +-
 LICENSE.txt                                   |   42 +-
 PUBLICATIONS.md                               |    2 +-
 README.md                                     |   93 +-
 cmake/nop.cu                                  |   42 +-
 cuBLAS.cmake                                  |   42 +-
 cuDNN.cmake                                   |   43 +-
 examples/00_basic_gemm/CMakeLists.txt         |   44 +-
 examples/00_basic_gemm/basic_gemm.cu          |   42 +-
 examples/01_cutlass_utilities/CMakeLists.txt  |   44 +-
 .../01_cutlass_utilities/cutlass_utilities.cu |   42 +-
 examples/02_dump_reg_shmem/CMakeLists.txt     |   44 +-
 examples/02_dump_reg_shmem/dump_reg_shmem.cu  |   42 +-
 examples/03_visualize_layout/CMakeLists.txt   |   44 +-
 examples/03_visualize_layout/options.h        |   42 +-
 .../03_visualize_layout/register_layout.cu    |   64 +-
 .../03_visualize_layout/register_layout.h     |   42 +-
 .../03_visualize_layout/visualize_layout.cpp  |   42 +-
 .../03_visualize_layout/visualize_layout.h    |   42 +-
 examples/04_tile_iterator/CMakeLists.txt      |   44 +-
 examples/04_tile_iterator/tile_iterator.cu    |   42 +-
 examples/05_batched_gemm/CMakeLists.txt       |   44 +-
 examples/05_batched_gemm/batched_gemm.cu      |   45 +-
 examples/06_splitK_gemm/CMakeLists.txt        |   44 +-
 examples/06_splitK_gemm/splitk_gemm.cu        |   42 +-
 .../07_volta_tensorop_gemm/CMakeLists.txt     |   44 +-
 .../volta_tensorop_gemm.cu                    |   42 +-
 .../08_turing_tensorop_gemm/CMakeLists.txt    |   44 +-
 .../turing_tensorop_gemm.cu                   |   42 +-
 .../CMakeLists.txt                            |   44 +-
 .../turing_tensorop_conv2dfprop.cu            |   64 +-
 examples/10_planar_complex/CMakeLists.txt     |   44 +-
 examples/10_planar_complex/planar_complex.cu  |   60 +-
 .../11_planar_complex_array/CMakeLists.txt    |   44 +-
 .../planar_complex_array.cu                   |   60 +-
 examples/12_gemm_bias_relu/CMakeLists.txt     |   44 +-
 examples/12_gemm_bias_relu/gemm_bias_relu.cu  |   43 +-
 .../13_two_tensor_op_fusion/CMakeLists.txt    |  105 +-
 examples/13_two_tensor_op_fusion/README.md    |   53 +-
 .../13_two_tensor_op_fusion/b2b_conv2d_run.h  |   50 +-
 .../13_two_tensor_op_fusion/b2b_gemm_run.h    |   60 +-
 .../b2b_interleaved_conv2d_run.h              |   50 +-
 .../b2b_interleaved_gemm_run.h                |   67 +-
 .../13_two_tensor_op_fusion/device/b2b_gemm.h |   48 +-
 .../device/b2b_implicit_gemm_convolution.h    |   46 +-
 .../13_two_tensor_op_fusion/fused_conv2d.cu   |  138 --
 .../13_two_tensor_op_fusion/fused_gemm.cu     |  141 --
 ..._sm75.h => fused_two_convs_f16_sm75_rf.cu} |  153 +-
 .../fused_two_convs_f16_sm75_shmem.cu         |  233 ++
 ..._sm80.h => fused_two_convs_f16_sm80_rf.cu} |   90 +-
 .../fused_two_convs_f16_sm80_shmem.cu         |  233 ++
 ...2_sm75.h => fused_two_convs_s8_sm75_rf.cu} |  146 +-
 .../fused_two_convs_s8_sm75_shmem.cu          |  235 ++
 ...2_sm80.h => fused_two_convs_s8_sm80_rf.cu} |   81 +-
 .../fused_two_convs_s8_sm80_shmem.cu          |  234 ++
 ..._sm75.h => fused_two_gemms_f16_sm75_rf.cu} |   85 +-
 .../fused_two_gemms_f16_sm75_shmem.cu         |  211 ++
 ..._sm80.h => fused_two_gemms_f16_sm80_rf.cu} |   69 +-
 .../fused_two_gemms_f16_sm80_shmem.cu         |  214 ++
 ...2_sm75.h => fused_two_gemms_s8_sm75_rf.cu} |   80 +-
 .../fused_two_gemms_s8_sm75_shmem.cu          |  211 ++
 ...2_sm80.h => fused_two_gemms_s8_sm80_rf.cu} |   80 +-
 .../fused_two_gemms_s8_sm80_shmem.cu          |  225 ++
 .../13_two_tensor_op_fusion/kernel/b2b_gemm.h |   55 +-
 .../kernel/b2b_implicit_gemm_convolution.h    |   42 +-
 .../kernel/default_b2b_conv2d_fprop.h         | 2135 +----------------
 .../kernel/default_b2b_conv2d_fprop_sm75.h    |  749 ++++++
 .../kernel/default_b2b_conv2d_fprop_sm80.h    |  740 ++++++
 ...t_b2b_conv2d_fprop_smem_accumulator_sm75.h |  817 +++++++
 ...t_b2b_conv2d_fprop_smem_accumulator_sm80.h |  804 +++++++
 .../kernel/default_b2b_gemm.h                 |   46 +-
 .../default_b2b_gemm_smem_accumulator.h       |  397 +++
 examples/13_two_tensor_op_fusion/test_run.h   |   95 +
 .../b2b_implicit_gemm_multistage.h            |   44 +-
 ...mplicit_gemm_multistage_smem_accumulator.h |  816 +++++++
 .../threadblock/b2b_implicit_gemm_pipelined.h |   42 +-
 ...implicit_gemm_pipelined_smem_accumulator.h |   61 +-
 .../threadblock/b2b_mma_base.h                |   42 +-
 .../b2b_mma_base_smem_accumulator.h           |   47 +-
 .../threadblock/b2b_mma_multistage.h          |   42 +-
 .../b2b_mma_multistage_smem_accumulator.h     |  860 +++++++
 .../threadblock/b2b_mma_pipelined.h           |  129 +-
 .../b2b_mma_pipelined_smem_accumulator.h      |  541 +++++
 .../threadblock/default_b2b_mma.h             |   46 +-
 .../default_b2b_mma_smem_accumulator.h        |  605 +++++
 .../CMakeLists.txt                            |   44 +-
 .../ampere_tf32_tensorop_gemm.cu              |   54 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_sparse_tensorop_gemm.cu            |   42 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_tensorop_conv2dfprop.cu            |   92 +-
 .../17_fprop_per_channel_bias/CMakeLists.txt  |   44 +-
 .../fprop_per_channel_bias.cu                 |   42 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_fp64_tensorop_affine2_gemm.cu      |   59 +-
 examples/19_tensorop_canonical/CMakeLists.txt |   44 +-
 .../tensorop_canonical.cu                     |   42 +-
 examples/20_simt_canonical/CMakeLists.txt     |   44 +-
 examples/20_simt_canonical/simt_canonical.cu  |   42 +-
 examples/21_quaternion_gemm/CMakeLists.txt    |   44 +-
 .../21_quaternion_gemm/quaternion_gemm.cu     |   68 +-
 examples/22_quaternion_conv/CMakeLists.txt    |   44 +-
 .../22_quaternion_conv/quaternion_conv.cu     |   64 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_gemm_operand_reduction_fusion.cu   |   66 +-
 examples/24_gemm_grouped/CMakeLists.txt       |   44 +-
 examples/24_gemm_grouped/gemm_grouped.cu      |  229 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_fprop_mainloop_fusion.cu           |   96 +-
 .../CMakeLists.txt                            |   44 +-
 .../ampere_wgrad_mainloop_fusion.cu           |   96 +-
 ...pere_3xtf32_fast_accurate_tensorop_gemm.cu |   58 +-
 .../CMakeLists.txt                            |   44 +-
 .../CMakeLists.txt                            |   44 +-
 ...ere_3xtf32_fast_accurate_tensorop_fprop.cu |   64 +-
 ...f32_fast_accurate_tensorop_complex_gemm.cu |   58 +-
 .../CMakeLists.txt                            |   44 +-
 examples/30_wgrad_split_k/30_wgrad_split_k.cu |   71 +-
 examples/30_wgrad_split_k/CMakeLists.txt      |   44 +-
 examples/31_basic_syrk/CMakeLists.txt         |   35 +
 examples/31_basic_syrk/basic_syrk.cu          |  522 ++++
 examples/32_basic_trmm/CMakeLists.txt         |   35 +
 examples/32_basic_trmm/basic_trmm.cu          |  550 +++++
 .../CMakeLists.txt                            |   35 +
 .../ampere_3xtf32_tensorop_symm.cu            |  687 ++++++
 .../34_transposed_conv2d.cu                   |   69 +-
 examples/34_transposed_conv2d/CMakeLists.txt  |   43 +-
 examples/35_gemm_softmax/CMakeLists.txt       |   36 +
 .../35_gemm_softmax/epilogue_with_visitor.h   |  407 ++++
 examples/35_gemm_softmax/gemm_softmax.cu      |  681 ++++++
 .../gemm_with_epilogue_visitor.h              |  498 ++++
 examples/35_gemm_softmax/gemm_with_softmax.h  | 1154 +++++++++
 .../36_gather_scatter_fusion/CMakeLists.txt   |   32 +
 .../gather_scatter_fusion.cu                  |  537 +++++
 examples/40_cutlass_py/test-cutlass-py.py     |  170 ++
 examples/CMakeLists.txt                       |   49 +-
 include/cutlass/aligned_buffer.h              |   42 +-
 include/cutlass/arch/arch.h                   |   42 +-
 include/cutlass/arch/cache_operation.h        |   42 +-
 include/cutlass/arch/memory.h                 |  154 +-
 include/cutlass/arch/memory_sm75.h            |   42 +-
 include/cutlass/arch/memory_sm80.h            |  138 +-
 include/cutlass/arch/mma.h                    |   42 +-
 include/cutlass/arch/mma_sm50.h               |   42 +-
 include/cutlass/arch/mma_sm60.h               |   42 +-
 include/cutlass/arch/mma_sm61.h               |   42 +-
 include/cutlass/arch/mma_sm70.h               |   66 +-
 include/cutlass/arch/mma_sm75.h               |   42 +-
 include/cutlass/arch/mma_sm80.h               |   42 +-
 include/cutlass/arch/mma_sparse_sm80.h        |   42 +-
 include/cutlass/arch/simd.h                   |   42 +-
 include/cutlass/arch/simd_sm60.h              |   42 +-
 include/cutlass/arch/simd_sm61.h              |   42 +-
 include/cutlass/arch/wmma.h                   |   42 +-
 include/cutlass/arch/wmma_sm70.h              |   42 +-
 include/cutlass/arch/wmma_sm72.h              |   42 +-
 include/cutlass/arch/wmma_sm75.h              |   42 +-
 include/cutlass/array.h                       |   46 +-
 include/cutlass/array_planar_complex.h        |   42 +-
 include/cutlass/array_subbyte.h               |   54 +-
 include/cutlass/bfloat16.h                    |   42 +-
 include/cutlass/blas3.h                       |  175 ++
 include/cutlass/complex.h                     |   59 +-
 include/cutlass/constants.h                   |   48 +-
 include/cutlass/conv/conv2d_problem_size.h    |  114 +-
 include/cutlass/conv/conv3d_problem_size.h    |   45 +-
 include/cutlass/conv/convolution.h            |   46 +-
 .../conv/device/implicit_gemm_convolution.h   |   42 +-
 .../device/implicit_gemm_convolution_fusion.h |   42 +-
 include/cutlass/conv/kernel/default_conv2d.h  |   46 +-
 .../conv/kernel/default_conv2d_dgrad.h        |   66 +-
 .../conv/kernel/default_conv2d_fprop.h        |  557 ++++-
 .../conv/kernel/default_conv2d_fprop_fusion.h |   46 +-
 .../default_conv2d_fprop_with_broadcast.h     |   42 +-
 .../default_conv2d_fprop_with_reduction.h     |   42 +-
 .../conv/kernel/default_conv2d_wgrad.h        |   58 +-
 .../conv/kernel/default_conv2d_wgrad_fusion.h |   46 +-
 .../conv/kernel/default_conv3d_dgrad.h        |   46 +-
 .../conv/kernel/default_conv3d_fprop.h        |   46 +-
 .../conv/kernel/default_conv3d_wgrad.h        |   50 +-
 .../conv/kernel/implicit_gemm_convolution.h   |   49 +-
 .../kernel/implicit_gemm_convolution_fusion.h |   42 +-
 .../implicit_gemm_convolution_strided_dgrad.h |   42 +-
 ...cit_gemm_convolution_with_fused_epilogue.h |   43 +-
 ...rad_filter_tile_access_iterator_analytic.h |   42 +-
 ...ad_filter_tile_access_iterator_optimized.h |   42 +-
 ...t_gradient_tile_access_iterator_analytic.h |   42 +-
 ..._gradient_tile_access_iterator_optimized.h |   42 +-
 ...activation_tile_access_iterator_analytic.h |   42 +-
 ...vation_tile_access_iterator_few_channels.h |  360 +++
 ...tion_tile_access_iterator_fixed_channels.h |  353 +++
 ...ctivation_tile_access_iterator_optimized.h |   42 +-
 ...rop_filter_tile_access_iterator_analytic.h |   42 +-
 ...filter_tile_access_iterator_few_channels.h |  289 +++
 ...lter_tile_access_iterator_fixed_channels.h |  275 +++
 ...op_filter_tile_access_iterator_optimized.h |   42 +-
 .../cutlass/conv/threadblock/conv2d_params.h  |   91 +-
 .../conv/threadblock/conv2d_tile_iterator.h   |   42 +-
 ...activation_tile_access_iterator_analytic.h |   44 +-
 ...ctivation_tile_access_iterator_optimized.h |   58 +-
 ...t_gradient_tile_access_iterator_analytic.h |   42 +-
 ..._gradient_tile_access_iterator_optimized.h |   42 +-
 ...rad_filter_tile_access_iterator_analytic.h |   42 +-
 ...ad_filter_tile_access_iterator_optimized.h |   42 +-
 ...t_gradient_tile_access_iterator_analytic.h |   42 +-
 ..._gradient_tile_access_iterator_optimized.h |   42 +-
 ...activation_tile_access_iterator_analytic.h |   42 +-
 ...ctivation_tile_access_iterator_optimized.h |   42 +-
 ...rop_filter_tile_access_iterator_analytic.h |   42 +-
 ...op_filter_tile_access_iterator_optimized.h |   42 +-
 .../cutlass/conv/threadblock/conv3d_params.h  |   42 +-
 ...activation_tile_access_iterator_analytic.h |   42 +-
 ...ctivation_tile_access_iterator_optimized.h |   42 +-
 ...t_gradient_tile_access_iterator_analytic.h |   42 +-
 ..._gradient_tile_access_iterator_optimized.h |   42 +-
 .../implicit_gemm_fprop_fusion_multistage.h   |   42 +-
 .../threadblock/implicit_gemm_multistage.h    |   44 +-
 .../threadblock/implicit_gemm_pipelined.h     |   42 +-
 .../implicit_gemm_wgrad_fusion_multistage.h   |   42 +-
 ...icated_scale_bias_vector_access_iterator.h |   42 +-
 .../predicated_scale_bias_vector_iterator.h   |   42 +-
 ...egular_scale_bias_vector_access_iterator.h |   42 +-
 .../conv/threadblock/threadblock_swizzle.h    |   42 +-
 .../warp/conv2d_fprop_scale_bias_iterator.h   |   42 +-
 .../conv/warp/scale_bias_relu_transform.h     |   42 +-
 include/cutlass/coord.h                       |   42 +-
 include/cutlass/core_io.h                     |   48 +-
 include/cutlass/cutlass.h                     |   72 +-
 include/cutlass/device_kernel.h               |   42 +-
 include/cutlass/epilogue/thread/activation.h  |   45 +-
 .../cutlass/epilogue/thread/conversion_op.h   |   42 +-
 .../epilogue/thread/linear_combination.h      |   43 +-
 .../linear_combination_bias_elementwise.h     |   42 +-
 .../thread/linear_combination_bias_relu.h     |   42 +-
 .../thread/linear_combination_clamp.h         |   42 +-
 .../thread/linear_combination_dgelu.h         |   42 +-
 .../thread/linear_combination_drelu.h         |   42 +-
 .../epilogue/thread/linear_combination_gelu.h |   42 +-
 .../thread/linear_combination_generic.h       |   42 +-
 .../thread/linear_combination_hardswish.h     |   42 +-
 .../linear_combination_planar_complex.h       |   42 +-
 .../epilogue/thread/linear_combination_relu.h |   42 +-
 .../thread/linear_combination_relu0.h         |   42 +-
 .../linear_combination_residual_block.h       |   46 +-
 .../thread/linear_combination_sigmoid.h       |   42 +-
 .../epilogue/thread/linear_combination_silu.h |   42 +-
 .../linear_combination_with_elementwise.h     |   42 +-
 .../cutlass/epilogue/thread/reduction_op.h    |   42 +-
 include/cutlass/epilogue/thread/scale_type.h  |   42 +-
 .../default_epilogue_complex_tensor_op.h      |   42 +-
 ...default_epilogue_complex_tensor_op_blas3.h |  264 ++
 .../default_epilogue_direct_store.h           |   42 +-
 .../default_epilogue_planar_complex.h         |   42 +-
 .../threadblock/default_epilogue_simt.h       |   48 +-
 .../threadblock/default_epilogue_tensor_op.h  |  196 +-
 .../default_epilogue_tensor_op_blas3.h        |  175 ++
 .../default_epilogue_volta_tensor_op.h        |   48 +-
 .../default_epilogue_with_broadcast.h         |   48 +-
 .../default_epilogue_with_reduction.h         |   54 +-
 .../default_epilogue_wmma_tensor_op.h         |   48 +-
 .../threadblock/default_thread_map_simt.h     |   42 +-
 .../default_thread_map_tensor_op.h            |   42 +-
 .../default_thread_map_volta_tensor_op.h      |   42 +-
 .../default_thread_map_wmma_tensor_op.h       |   42 +-
 .../direct_store_epilogue_iterator.h          |   45 +-
 .../cutlass/epilogue/threadblock/epilogue.h   |   45 +-
 .../epilogue/threadblock/epilogue_base.h      |   42 +-
 .../threadblock/epilogue_direct_store.h       |   42 +-
 .../threadblock/epilogue_gemm_k_reduction.h   |   44 +-
 .../threadblock/epilogue_planar_complex.h     |   44 +-
 .../threadblock/epilogue_smem_accumulator.h   |   93 +-
 .../threadblock/epilogue_with_broadcast.h     |   44 +-
 .../threadblock/epilogue_with_reduction.h     |   42 +-
 .../epilogue/threadblock/epilogue_workspace.h |   42 +-
 .../threadblock/interleaved_epilogue.h        |   44 +-
 .../threadblock/output_tile_thread_map.h      |  102 +-
 .../threadblock/predicated_tile_iterator.h    |  263 +-
 .../predicated_tile_iterator_affine.h         |  105 +-
 .../predicated_tile_iterator_blas3.h          |  633 +++++
 .../predicated_tile_iterator_params.h         |   54 +-
 .../predicated_tile_iterator_predicates.h     |   42 +-
 .../predicated_tile_iterator_strided_dgrad.h  |   42 +-
 .../threadblock/shared_load_iterator.h        |   42 +-
 .../threadblock/shared_load_iterator_mixed.h  |   56 +-
 .../fragment_iterator_complex_tensor_op.h     |   42 +-
 ...ment_iterator_gaussian_complex_tensor_op.h |   42 +-
 .../epilogue/warp/fragment_iterator_simt.h    |   42 +-
 .../warp/fragment_iterator_tensor_op.h        |   42 +-
 .../warp/fragment_iterator_volta_tensor_op.h  |   42 +-
 .../warp/fragment_iterator_wmma_tensor_op.h   |   42 +-
 include/cutlass/epilogue/warp/simt_policy.h   |   42 +-
 .../cutlass/epilogue/warp/tensor_op_policy.h  |   42 +-
 .../epilogue/warp/tile_iterator_simt.h        |   42 +-
 .../epilogue/warp/tile_iterator_tensor_op.h   |   45 +-
 .../warp/tile_iterator_tensor_op_mixed.h      |  108 +-
 .../warp/tile_iterator_volta_tensor_op.h      |   42 +-
 .../warp/tile_iterator_wmma_tensor_op.h       |   42 +-
 .../epilogue/warp/volta_tensor_op_policy.h    |   42 +-
 .../epilogue/warp/wmma_tensor_op_policy.h     |   42 +-
 include/cutlass/fast_math.h                   |   58 +-
 include/cutlass/functional.h                  |   70 +-
 .../gemm/device/default_gemm_configuration.h  |   44 +-
 include/cutlass/gemm/device/gemm.h            |  110 +-
 include/cutlass/gemm/device/gemm_array.h      |   49 +-
 include/cutlass/gemm/device/gemm_batched.h    |   49 +-
 include/cutlass/gemm/device/gemm_complex.h    |   42 +-
 include/cutlass/gemm/device/gemm_grouped.h    |   72 +-
 include/cutlass/gemm/device/gemm_sparse.h     |   42 +-
 .../gemm/device/gemm_splitk_parallel.h        |   42 +-
 include/cutlass/gemm/device/gemm_universal.h  |   80 +-
 .../gemm/device/gemm_universal_adapter.h      |   44 +-
 .../cutlass/gemm/device/gemm_universal_base.h |   49 +-
 include/cutlass/gemm/device/gemv.h            |   42 +-
 include/cutlass/gemm/device/rank_2k.h         |  547 +++++
 include/cutlass/gemm/device/rank_k.h          |  509 ++++
 include/cutlass/gemm/device/symm.h            |  602 +++++
 include/cutlass/gemm/device/trmm.h            |  758 ++++++
 include/cutlass/gemm/gemm.h                   |   43 +-
 include/cutlass/gemm/kernel/default_gemm.h    |  172 +-
 .../gemm/kernel/default_gemm_complex.h        |   42 +-
 .../gemm/kernel/default_gemm_grouped.h        |   42 +-
 .../default_gemm_planar_complex_universal.h   |   46 +-
 .../cutlass/gemm/kernel/default_gemm_sparse.h |   42 +-
 .../kernel/default_gemm_splitk_parallel.h     |   42 +-
 .../gemm/kernel/default_gemm_universal.h      |   71 +-
 .../gemm/kernel/default_gemm_with_broadcast.h |   42 +-
 .../kernel/default_gemm_with_k_reduction.h    |   42 +-
 .../gemm/kernel/default_gemm_with_reduction.h |   42 +-
 include/cutlass/gemm/kernel/default_gemv.h    |   42 +-
 include/cutlass/gemm/kernel/default_rank_2k.h |  207 ++
 .../gemm/kernel/default_rank_2k_complex.h     |  334 +++
 .../gemm/kernel/default_rank_2k_universal.h   |  346 +++
 include/cutlass/gemm/kernel/default_rank_k.h  |  185 ++
 .../gemm/kernel/default_rank_k_complex.h      |  295 +++
 .../gemm/kernel/default_rank_k_universal.h    |  305 +++
 include/cutlass/gemm/kernel/default_symm.h    |  227 ++
 .../gemm/kernel/default_symm_complex.h        |  316 +++
 .../gemm/kernel/default_symm_universal.h      |  342 +++
 include/cutlass/gemm/kernel/default_trmm.h    |  199 ++
 .../gemm/kernel/default_trmm_complex.h        |  197 ++
 .../gemm/kernel/default_trmm_universal.h      |  359 +++
 include/cutlass/gemm/kernel/gemm.h            |   84 +-
 include/cutlass/gemm/kernel/gemm_array.h      |   42 +-
 include/cutlass/gemm/kernel/gemm_batched.h    |   42 +-
 include/cutlass/gemm/kernel/gemm_grouped.h    |   44 +-
 include/cutlass/gemm/kernel/gemm_params.h     |   42 +-
 include/cutlass/gemm/kernel/gemm_pipelined.h  |   42 +-
 .../cutlass/gemm/kernel/gemm_planar_complex.h |   67 +-
 .../gemm/kernel/gemm_planar_complex_array.h   |   67 +-
 .../gemm/kernel/gemm_splitk_parallel.h        |   42 +-
 .../gemm/kernel/gemm_transpose_operands.h     |   42 +-
 include/cutlass/gemm/kernel/gemm_universal.h  |  165 +-
 .../gemm/kernel/gemm_with_fused_epilogue.h    |   88 +-
 .../gemm/kernel/gemm_with_k_reduction.h       |   96 +-
 include/cutlass/gemm/kernel/gemv.h            |   42 +-
 .../gemm/kernel/gemv_batched_strided.h        |   42 +-
 .../cutlass/gemm/kernel/rank_2k_universal.h   |  778 ++++++
 .../cutlass/gemm/kernel/rank_k_universal.h    |  565 +++++
 include/cutlass/gemm/kernel/sparse_gemm.h     |   42 +-
 include/cutlass/gemm/kernel/symm_universal.h  |  698 ++++++
 include/cutlass/gemm/kernel/trmm_universal.h  |  599 +++++
 include/cutlass/gemm/thread/mma.h             |   42 +-
 include/cutlass/gemm/thread/mma_sm50.h        |  164 +-
 include/cutlass/gemm/thread/mma_sm60.h        |   42 +-
 include/cutlass/gemm/thread/mma_sm61.h        |   42 +-
 .../gemm/threadblock/default_gemv_core.h      |   42 +-
 .../cutlass/gemm/threadblock/default_mma.h    |  129 +-
 .../gemm/threadblock/default_mma_core.h       |   42 +-
 .../gemm/threadblock/default_mma_core_simt.h  |   42 +-
 .../gemm/threadblock/default_mma_core_sm70.h  |   42 +-
 .../gemm/threadblock/default_mma_core_sm75.h  |   42 +-
 .../gemm/threadblock/default_mma_core_sm80.h  |   42 +-
 .../default_mma_core_sparse_sm80.h            |   42 +-
 .../default_mma_core_with_access_size.h       |  291 +++
 .../default_mma_core_with_reduction.h         |   42 +-
 .../gemm/threadblock/default_mma_core_wmma.h  |   42 +-
 .../default_mma_planar_complex_multistage.h   |   42 +-
 .../default_mma_planar_complex_pipelined.h    |   42 +-
 .../threadblock/default_mma_with_reduction.h  |   42 +-
 .../default_multistage_mma_complex.h          |   42 +-
 .../default_multistage_mma_complex_core.h     |   42 +-
 ...default_multistage_mma_complex_core_sm80.h |   42 +-
 .../default_multistage_trmm_complex.h         |  556 +++++
 .../gemm/threadblock/default_sparse_mma.h     |   42 +-
 .../cutlass/gemm/threadblock/default_trmm.h   |  445 ++++
 include/cutlass/gemm/threadblock/gemv.h       |   42 +-
 include/cutlass/gemm/threadblock/mma_base.h   |   42 +-
 .../gemm/threadblock/mma_blas3_multistage.h   |  706 ++++++
 .../cutlass/gemm/threadblock/mma_multistage.h |   42 +-
 .../cutlass/gemm/threadblock/mma_pipelined.h  |   42 +-
 .../threadblock/mma_planar_complex_base.h     |   42 +-
 .../mma_planar_complex_multistage.h           |   42 +-
 .../mma_planar_complex_pipelined.h            |   42 +-
 .../gemm/threadblock/mma_singlestage.h        |   42 +-
 .../gemm/threadblock/mma_sparse_base.h        |   42 +-
 .../gemm/threadblock/mma_sparse_multistage.h  |   42 +-
 .../mma_with_reduction_multistage.h           |   42 +-
 .../gemm/threadblock/threadblock_swizzle.h    |   42 +-
 .../gemm/warp/default_mma_complex_tensor_op.h |   42 +-
 .../gemm/warp/default_mma_sparse_tensor_op.h  |   42 +-
 .../cutlass/gemm/warp/default_mma_tensor_op.h |   42 +-
 .../gemm/warp/default_mma_tensor_op_sm80.h    |   42 +-
 .../default_mma_with_reduction_tensor_op.h    |   42 +-
 .../gemm/warp/default_mma_wmma_tensor_op.h    |   42 +-
 include/cutlass/gemm/warp/mma.h               |   42 +-
 .../cutlass/gemm/warp/mma_complex_tensor_op.h |   42 +-
 .../warp/mma_complex_tensor_op_fast_f32.h     |   42 +-
 ...mma_complex_tensor_op_tile_iterator_sm80.h |   42 +-
 .../warp/mma_gaussian_complex_tensor_op.h     |   42 +-
 ...ian_complex_tensor_op_tile_iterator_sm80.h |   42 +-
 .../cutlass/gemm/warp/mma_planar_complex.h    |   42 +-
 include/cutlass/gemm/warp/mma_simt.h          |   42 +-
 include/cutlass/gemm/warp/mma_simt_policy.h   |   42 +-
 .../gemm/warp/mma_simt_tile_iterator.h        |   42 +-
 .../cutlass/gemm/warp/mma_sparse_tensor_op.h  |   42 +-
 include/cutlass/gemm/warp/mma_tensor_op.h     |   42 +-
 .../gemm/warp/mma_tensor_op_fast_f32.h        |   42 +-
 .../cutlass/gemm/warp/mma_tensor_op_policy.h  |   42 +-
 .../cutlass/gemm/warp/mma_tensor_op_sm70.h    |   42 +-
 .../gemm/warp/mma_tensor_op_tile_iterator.h   |   42 +-
 .../warp/mma_tensor_op_tile_iterator_sm70.h   |   42 +-
 .../warp/mma_tensor_op_tile_iterator_sm80.h   |   42 +-
 .../warp/mma_tensor_op_tile_iterator_sparse.h |   42 +-
 .../warp/mma_tensor_op_tile_iterator_wmma.h   |   42 +-
 .../cutlass/gemm/warp/mma_tensor_op_wmma.h    |   42 +-
 .../gemm/warp/mma_with_reduction_tensor_op.h  |   42 +-
 .../gemm/warp/tile_iterator_planar_complex.h  |   42 +-
 include/cutlass/half.h                        |   42 +-
 include/cutlass/integer_subbyte.h             |   42 +-
 include/cutlass/kernel_launch.h               |   42 +-
 include/cutlass/layout/layout.h               |   42 +-
 include/cutlass/layout/matrix.h               |   42 +-
 include/cutlass/layout/pitch_linear.h         |   42 +-
 include/cutlass/layout/tensor.h               |   44 +-
 .../layout/tensor_op_multiplicand_sm70.h      |   42 +-
 .../layout/tensor_op_multiplicand_sm75.h      |   42 +-
 .../layout/tensor_op_multiplicand_sm80.h      |   42 +-
 include/cutlass/layout/vector.h               |   42 +-
 include/cutlass/matrix.h                      |   42 +-
 include/cutlass/matrix_coord.h                |   42 +-
 include/cutlass/matrix_shape.h                |   42 +-
 include/cutlass/numeric_conversion.h          |   77 +-
 include/cutlass/numeric_types.h               |   64 +-
 include/cutlass/pitch_linear_coord.h          |   42 +-
 include/cutlass/platform/platform.h           |   42 +-
 include/cutlass/predicate_vector.h            |   42 +-
 include/cutlass/quaternion.h                  |   50 +-
 include/cutlass/real.h                        |   42 +-
 .../cutlass/reduction/device/reduce_split_k.h |   42 +-
 .../cutlass/reduction/device/tensor_reduce.h  |   42 +-
 .../device/tensor_reduce_affine_contiguous.h  |   42 +-
 .../device/tensor_reduce_affine_strided.h     |   42 +-
 .../cutlass/reduction/kernel/reduce_split_k.h |   42 +-
 .../kernel/tensor_reduce_affine_contiguous.h  |   42 +-
 .../kernel/tensor_reduce_affine_strided.h     |   42 +-
 include/cutlass/reduction/thread/reduce.h     |   42 +-
 .../reduction/thread/reduction_operators.h    |   42 +-
 .../cutlass/reduction/threadblock_swizzle.h   |   48 +-
 include/cutlass/relatively_equal.h            |   42 +-
 include/cutlass/semaphore.h                   |   42 +-
 include/cutlass/subbyte_reference.h           |   42 +-
 include/cutlass/tensor_coord.h                |   42 +-
 include/cutlass/tensor_ref.h                  |   44 +-
 include/cutlass/tensor_ref_planar_complex.h   |   42 +-
 include/cutlass/tensor_view.h                 |   42 +-
 include/cutlass/tensor_view_planar_complex.h  |   42 +-
 include/cutlass/tfloat32.h                    |   42 +-
 include/cutlass/thread/matrix.h               |   42 +-
 include/cutlass/trace.h                       |   42 +-
 .../transform/pitch_linear_thread_map.h       |   42 +-
 include/cutlass/transform/thread/transpose.h  |   42 +-
 include/cutlass/transform/thread/unaryOp.h    |   42 +-
 .../predicated_tile_access_iterator.h         |  219 +-
 ...icated_tile_access_iterator_2dthreadtile.h |   46 +-
 .../predicated_tile_access_iterator_params.h  |   71 +-
 ...d_tile_access_iterator_triangular_matrix.h |  892 +++++++
 .../threadblock/predicated_tile_iterator.h    |  117 +-
 .../predicated_tile_iterator_2dthreadtile.h   |   52 +-
 ...edicated_tile_iterator_triangular_matrix.h |  818 +++++++
 .../predicated_vector_access_iterator.h       |   42 +-
 .../regular_tile_access_iterator.h            |   42 +-
 ...egular_tile_access_iterator_pitch_linear.h |   42 +-
 .../regular_tile_access_iterator_tensor_op.h  |   42 +-
 ...ular_tile_access_iterator_tensor_op_sm80.h |   44 +-
 .../threadblock/regular_tile_iterator.h       |   42 +-
 .../regular_tile_iterator_pitch_linear.h      |   42 +-
 ..._tile_iterator_pitch_linear_2dthreadtile.h |   42 +-
 .../regular_tile_iterator_tensor_op.h         |   42 +-
 .../regular_tile_iterator_tensor_op_sm70.h    |   42 +-
 .../transform/threadblock/vector_iterator.h   |   42 +-
 .../transform/warp/vector_fragment_iterator.h |   42 +-
 include/cutlass/uint128.h                     |   56 +-
 include/cutlass/wmma_array.h                  |   42 +-
 media/docs/code_organization.md               |   48 +-
 media/docs/doxygen_mainpage.md                |   44 +-
 media/docs/efficient_gemm.md                  |   44 +-
 media/docs/functionality.md                   |   44 +-
 media/docs/fundamental_types.md               |   44 +-
 media/docs/gemm_api.md                        |   44 +-
 media/docs/implicit_gemm_convolution.md       |   48 +-
 media/docs/layout.md                          |   44 +-
 media/docs/profiler.md                        |   44 +-
 media/docs/programming_guidelines.md          |   44 +-
 media/docs/quickstart.md                      |   48 +-
 media/docs/terminology.md                     |   44 +-
 media/docs/tile_iterator_concept.md           |   44 +-
 media/docs/utilities.md                       |   44 +-
 .../13_example_shmem_resident_fusion.png      |  Bin 0 -> 18318 bytes
 test/CMakeLists.txt                           |   42 +-
 test/unit/CMakeLists.txt                      |   42 +-
 test/unit/common/cutlass_unit_test.h          |   42 +-
 test/unit/common/filter_architecture.cpp      |   42 +-
 test/unit/conv/CMakeLists.txt                 |   42 +-
 test/unit/conv/device/CMakeLists.txt          |   49 +-
 test/unit/conv/device/cache_testbed_output.h  |   42 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   42 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   42 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |  303 +++
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |  240 ++
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   42 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   42 +-
 ...m_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu |   42 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |   44 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   45 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu |   42 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   42 +-
 ...f32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu |   42 +-
 ...wx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu |   42 +-
 ...wx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu |   42 +-
 ...4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu |   46 +-
 ...4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu |   46 +-
 ...wx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu |   42 +-
 ...wx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu |   42 +-
 ...8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu |   46 +-
 ...8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu |   46 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 .../conv2d_fprop_with_broadcast_sm70.cu       |   44 +-
 .../conv2d_fprop_with_broadcast_sm75.cu       |   43 +-
 .../conv2d_fprop_with_reduction_sm75.cu       |   42 +-
 test/unit/conv/device/conv2d_problems.h       |   43 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 test/unit/conv/device/conv2d_testbed.h        |  155 +-
 .../conv/device/conv2d_testbed_interleaved.h  |   42 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   42 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   42 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   42 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   42 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |  100 +-
 .../device/conv2d_with_broadcast_testbed.h    |   60 +-
 .../device/conv2d_with_reduction_testbed.h    |   42 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu |   42 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 test/unit/conv/device/conv3d_problems.h       |   42 +-
 test/unit/conv/device/conv3d_testbed.h        |   42 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu |   42 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   42 +-
 test/unit/core/CMakeLists.txt                 |   42 +-
 test/unit/core/array.cu                       |   42 +-
 test/unit/core/bfloat16.cu                    |   42 +-
 test/unit/core/complex.cu                     |   42 +-
 test/unit/core/functional.cu                  |   42 +-
 test/unit/core/half.cu                        |   42 +-
 test/unit/core/matrix.cu                      |   42 +-
 test/unit/core/matrix_coord.cu                |   48 +-
 test/unit/core/numeric_conversion.cu          |   42 +-
 test/unit/core/predicate_vector.cu            |   42 +-
 test/unit/core/quaternion.cu                  |   42 +-
 test/unit/core/tensor_ref.cu                  |   42 +-
 test/unit/core/tensor_view.cu                 |   42 +-
 test/unit/core/test_unit_core.cpp             |   42 +-
 test/unit/core/tfloat32.cu                    |   42 +-
 ...est_unit_conv_device_tensorop_f32_sm80.txt |  798 +++++-
 test/unit/epilogue/CMakeLists.txt             |   42 +-
 test/unit/epilogue/thread/CMakeLists.txt      |   42 +-
 test/unit/epilogue/thread/activation.cu       |   42 +-
 .../epilogue/thread/linear_combination.cu     |   42 +-
 .../linear_combination_planar_complex.cu      |   42 +-
 test/unit/epilogue/threadblock/CMakeLists.txt |   42 +-
 .../threadblock/epilogue_planar_complex.cu    |   42 +-
 .../epilogue/threadblock/epilogue_simt.cu     |   42 +-
 .../threadblock/epilogue_simt_sm60.cu         |   42 +-
 .../threadblock/epilogue_simt_sm61.cu         |   42 +-
 .../threadblock/epilogue_tensor_op.cu         |   52 +-
 .../threadblock/epilogue_volta_tensor_op.cu   |   42 +-
 .../epilogue_with_reduction_tensor_op.cu      |   42 +-
 .../epilogue_with_reduction_testbed.h         |   42 +-
 .../epilogue_wmma_tensor_op_sm70.cu           |   42 +-
 .../threadblock/output_tile_threadmap.cu      |   42 +-
 .../threadblock/predicated_tile_iterator.cu   |   42 +-
 test/unit/epilogue/threadblock/testbed.h      |   42 +-
 .../threadblock/testbed_planar_complex.h      |   42 +-
 test/unit/epilogue/warp/CMakeLists.txt        |   42 +-
 .../warp/fragment_iterator_tensor_op.cu       |   42 +-
 .../warp/fragment_iterator_volta_tensor_op.cu |   42 +-
 .../warp/fragment_iterator_wmma_tensor_op.cu  |   42 +-
 test/unit/gemm/CMakeLists.txt                 |   42 +-
 test/unit/gemm/device/CMakeLists.txt          |  196 +-
 .../gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu   |   86 +-
 .../gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu   |   46 +-
 ...mm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu |   42 +-
 .../gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu   |   85 +-
 .../gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu   |   46 +-
 ...mm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu |   42 +-
 ...emm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...mm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu |   42 +-
 ...32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu |   42 +-
 ...32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu |   42 +-
 ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu |   42 +-
 ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu |   42 +-
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |   42 +-
 ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu |   42 +-
 ...6n_f16n_direct_store_tensor_op_f32_sm80.cu |   42 +-
 ..._f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu |   42 +-
 ...16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu |   42 +-
 ..._f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu |   42 +-
 ..._f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu |   42 +-
 ..._f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu |   42 +-
 ...6n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu |   42 +-
 ...6n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu |   42 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu |   42 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu |   42 +-
 ...16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu |   42 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu |   42 +-
 ...f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu |   42 +-
 ..._f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu |   42 +-
 ..._f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ...16n_singlestage_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu |   42 +-
 ...16t_singlestage_wmma_tensor_op_f16_sm70.cu |   42 +-
 ...6t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu |   42 +-
 ...6t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu |   42 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu |   42 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu |   42 +-
 ...16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu |   42 +-
 ...f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu |   42 +-
 ...32t_singlestage_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu |   42 +-
 ..._f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu |   42 +-
 ..._f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu |   42 +-
 ..._f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu |   42 +-
 .../gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu |   42 +-
 .../gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu |   42 +-
 ..._f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu |   42 +-
 ..._f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu |   42 +-
 .../gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 ...32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu |   42 +-
 .../gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu |   42 +-
 .../gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu |   42 +-
 test/unit/gemm/device/gemm_grouped_sm80.cu    |  297 ++-
 ...anar_complex_f16_f16_f32_tensor_op_sm70.cu |   42 +-
 ...anar_complex_f16_f16_f32_tensor_op_sm75.cu |   42 +-
 ...anar_complex_f16_f16_f32_tensor_op_sm80.cu |   42 +-
 .../gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu    |   42 +-
 .../gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu    |   42 +-
 .../gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu   |   42 +-
 .../gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu   |   46 +-
 ...mm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu |   42 +-
 .../gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu   |   42 +-
 .../gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu   |   46 +-
 ..._s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu |   46 +-
 ...mm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu |   42 +-
 .../gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu    |  107 +-
 .../gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu    |  374 +++
 .../gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu    |  117 +-
 .../gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu    |  374 +++
 .../gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu    |   42 +-
 .../gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu    |   42 +-
 .../gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu   |   42 +-
 .../gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu   |   46 +-
 ...mm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu |   42 +-
 .../gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu   |   42 +-
 .../gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu   |   46 +-
 ..._s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu |   46 +-
 ...mm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu |   42 +-
 .../gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu    |   46 +-
 .../gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu    |   46 +-
 ...emm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu |   42 +-
 .../gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu    |   46 +-
 .../gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu    |   46 +-
 ...emm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu |   42 +-
 .../gemm_splitk_serial_tensor_op_sm75.cu      |   47 +-
 .../unit/gemm/device/gemm_splitk_simt_sm50.cu |   42 +-
 .../gemm/device/gemm_splitk_tensor_op_sm70.cu |   42 +-
 .../gemm/device/gemm_splitk_tensor_op_sm75.cu |   42 +-
 ...emm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...emm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...emm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...emm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu |   42 +-
 ...mm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu |   42 +-
 ...al_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu |   42 +-
 ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu |   42 +-
 ...al_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu |   42 +-
 ...ersal_f16n_f16t_f32n_tensor_op_f32_sm75.cu |  117 +
 ...ersal_f16n_f16t_f32t_tensor_op_f32_sm75.cu |   50 +-
 ...adcast_f16n_f16n_f16n_tensorop_f32_sm75.cu |   42 +-
 ...uction_f16n_f16n_f16n_tensorop_f32_sm75.cu |   42 +-
 ...uction_f16t_f16n_f16n_tensorop_f32_sm80.cu |   42 +-
 test/unit/gemm/device/gemv.cu                 |   42 +-
 .../hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu |  175 ++
 .../hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu |  175 ++
 ..._cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu |  175 ++
 ..._cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu |  175 ++
 ...4n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu |  175 ++
 ...cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu |  175 ++
 ...cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu |  175 ++
 .../her2k_cf32h_cf32n_tensor_op_f32_sm80.cu   |  149 ++
 ...r2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu |  149 ++
 .../her2k_cf64n_cf64n_tensor_op_f64_sm80.cu   |  149 ++
 .../her2k_cf64n_cf64t_tensor_op_f64_sm80.cu   |  201 ++
 .../herk_cf32h_cf32n_tensor_op_f32_sm80.cu    |  219 ++
 ...erk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu |  219 ++
 .../herk_cf64h_cf64n_tensor_op_f64_sm80.cu    |  175 ++
 test/unit/gemm/device/multistage_testbed.h    |   42 +-
 .../device/multistage_testbed_interleaved.h   |   42 +-
 test/unit/gemm/device/simt_cgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_cgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_cgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_cgemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_dgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_dgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_dgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_dgemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_hgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_hgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_hgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_hgemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_igemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_igemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_igemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_igemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_int8_igemm_sm61.cu |   42 +-
 .../gemm/device/simt_int8_igemm_sm61_perf.cu  |   42 +-
 .../device/simt_int8_igemm_sm61_sliced_k.cu   |   42 +-
 test/unit/gemm/device/simt_qgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_qgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_qgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_qgemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_nt_sm80.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_tn_sm80.cu   |   42 +-
 test/unit/gemm/device/simt_sgemm_tt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_sm50.py            |   90 +-
 test/unit/gemm/device/simt_zgemm_nn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_zgemm_nt_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_zgemm_tn_sm50.cu   |   42 +-
 test/unit/gemm/device/simt_zgemm_tt_sm50.cu   |   42 +-
 .../symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu |  172 ++
 .../symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu |  172 ++
 ..._cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu |  172 ++
 ..._cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu |  172 ++
 ...4n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu |  172 ++
 ...cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu |  172 ++
 ...cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu |  172 ++
 ...mm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu |  489 ++++
 ...mm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu |  276 +++
 ...mm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu |  489 ++++
 .../symm_f64n_f64n_tensor_op_f64_ls_sm80.cu   |  258 ++
 .../symm_f64n_f64n_tensor_op_f64_rs_sm80.cu   |  258 ++
 .../symm_f64n_f64t_tensor_op_f64_ls_sm80.cu   |  258 ++
 .../symm_f64n_f64t_tensor_op_f64_rs_sm80.cu   |  258 ++
 .../symm_f64t_f64n_tensor_op_f64_ls_sm80.cu   |  258 ++
 .../symm_f64t_f64n_tensor_op_f64_rs_sm80.cu   |  258 ++
 .../symm_f64t_f64t_tensor_op_f64_ls_sm80.cu   |  258 ++
 .../symm_f64t_f64t_tensor_op_f64_rs_sm80.cu   |  258 ++
 .../symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu  |  489 ++++
 .../symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu  |  276 +++
 .../symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu  |  489 ++++
 .../syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu   |  150 ++
 ...r2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu |  150 ++
 .../syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu   |  150 ++
 ...r2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |  150 ++
 .../syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu   |  150 ++
 .../syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu   |  150 ++
 ...syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu |  132 +
 ...syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu |  133 +
 .../syr2k_f64n_f64n_tensor_op_f64_sm80.cu     |  253 ++
 .../syr2k_f64n_f64t_tensor_op_f64_sm80.cu     |  253 ++
 .../syr2k_f64t_f64n_tensor_op_f64_sm80.cu     |  253 ++
 .../syr2k_tf32n_f32n_tensor_op_f32_sm80.cu    |  132 +
 .../syr2k_tf32t_f32n_tensor_op_f32_sm80.cu    |  133 +
 .../syrk_cf32n_cf32n_tensor_op_f32_sm80.cu    |  137 ++
 ...yrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu |  137 ++
 .../syrk_cf32n_cf32t_tensor_op_f32_sm80.cu    |  137 ++
 ...yrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |  137 ++
 .../syrk_cf64n_cf64n_tensor_op_f64_sm80.cu    |  136 ++
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |   95 +
 .../syrk_cf64n_cf64t_tensor_op_f64_sm80.cu    |  136 ++
 .../syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu |  541 +++++
 .../syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu |  541 +++++
 .../syrk_f64n_f64t_tensor_op_f64_sm80.cu      |  237 ++
 .../syrk_f64t_f64n_tensor_op_f64_sm80.cu      |  301 +++
 .../syrk_tf32n_f32t_tensor_op_f32_sm80.cu     |  541 +++++
 .../syrk_tf32t_f32t_tensor_op_f32_sm80.cu     |  541 +++++
 test/unit/gemm/device/testbed.h               |   60 +-
 test/unit/gemm/device/testbed_complex.h       |   42 +-
 .../gemm/device/testbed_gemm_with_broadcast.h |   42 +-
 .../gemm/device/testbed_gemm_with_reduction.h |   42 +-
 test/unit/gemm/device/testbed_grouped.h       |   42 +-
 test/unit/gemm/device/testbed_interleaved.h   |   42 +-
 .../unit/gemm/device/testbed_planar_complex.h |   42 +-
 .../gemm/device/testbed_rank2k_universal.h    |  639 +++++
 .../gemm/device/testbed_rank_k_universal.h    |  509 ++++
 test/unit/gemm/device/testbed_sanity.h        |   42 +-
 test/unit/gemm/device/testbed_sparse.h        |   42 +-
 test/unit/gemm/device/testbed_splitk.h        |   42 +-
 .../unit/gemm/device/testbed_symm_universal.h |  589 +++++
 .../unit/gemm/device/testbed_trmm_universal.h |  606 +++++
 test/unit/gemm/device/testbed_universal.h     |   42 +-
 test/unit/gemm/device/testbed_utils.h         |   42 +-
 ...mm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu |  305 +++
 ...32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |  305 +++
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |  137 ++
 ...mm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu |  301 +++
 ...2n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu |  500 ++++
 ...2n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu |  252 ++
 ...2t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu |  449 ++++
 ...2t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu |  458 ++++
 ...mm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu |  414 ++++
 ...mm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu |  415 ++++
 ...mm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu |  415 ++++
 ...mm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu |  414 ++++
 ...mm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu |  415 ++++
 ..._tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu |  500 ++++
 ..._tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu |  252 ++
 ..._tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu |  449 ++++
 ..._tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu |  458 ++++
 test/unit/gemm/kernel/batched_gemv.cu         |   42 +-
 test/unit/gemm/kernel/testbed_gemv.h          |   42 +-
 test/unit/gemm/thread/CMakeLists.txt          |   42 +-
 test/unit/gemm/thread/gemm_sm50.cu            |   42 +-
 test/unit/gemm/thread/gemm_sm60.cu            |   42 +-
 test/unit/gemm/thread/gemm_sm61.cu            |   42 +-
 test/unit/gemm/thread/host/CMakeLists.txt     |   42 +-
 test/unit/gemm/thread/host/gemm_sm60_host.cu  |   42 +-
 test/unit/gemm/thread/host/testbed_host.h     |   42 +-
 test/unit/gemm/thread/testbed.h               |   42 +-
 test/unit/gemm/threadblock/CMakeLists.txt     |   44 +-
 test/unit/gemm/threadblock/batched_gemv.cu    |   42 +-
 .../gemm/threadblock/epilogue_workspace.cu    |   42 +-
 test/unit/gemm/threadblock/mma_multistage.cu  |   42 +-
 .../threadblock/mma_multistage_slicedk.cu     |  111 +
 .../gemm/threadblock/mma_multistage_sparse.cu |   42 +-
 .../mma_multistage_sparse_testbed.h           |   77 +-
 .../gemm/threadblock/mma_multistage_testbed.h |   42 +-
 .../mma_multistage_testbed_slicedk.h          |  387 +++
 .../gemm/threadblock/mma_pipelined_simt.cu    |   42 +-
 .../gemm/threadblock/mma_pipelined_slicedk.cu |  186 ++
 .../gemm/threadblock/mma_pipelined_sm70.cu    |   42 +-
 .../gemm/threadblock/mma_pipelined_sm75.cu    |   42 +-
 .../gemm/threadblock/mma_pipelined_sm80.cu    |   42 +-
 .../gemm/threadblock/mma_pipelined_testbed.h  |   42 +-
 .../mma_pipelined_testbed_slicedk.h           |  370 +++
 .../threadblock/mma_pipelined_wmma_sm70.cu    |   42 +-
 .../threadblock/mma_pipelined_wmma_sm75.cu    |   42 +-
 .../threadblock/mma_planar_complex_sm80.cu    |   42 +-
 .../threadblock/mma_planar_complex_testbed.h  |   42 +-
 .../threadblock/mma_singlestage_wmma_sm70.cu  |   42 +-
 .../threadblock/mma_singlestage_wmma_sm75.cu  |   42 +-
 test/unit/gemm/warp/CMakeLists.txt            |   42 +-
 test/unit/gemm/warp/gemm_complex_sm80.cu      |   42 +-
 .../gemm/warp/gemm_gaussian_complex_sm80.cu   |   42 +-
 test/unit/gemm/warp/gemm_sm50.cu              |   42 +-
 test/unit/gemm/warp/gemm_sm60.cu              |   42 +-
 test/unit/gemm/warp/gemm_sm61.cu              |   42 +-
 test/unit/gemm/warp/gemm_sm70.cu              |   42 +-
 test/unit/gemm/warp/gemm_sm75.cu              |   42 +-
 test/unit/gemm/warp/gemm_sm80.cu              |   42 +-
 test/unit/gemm/warp/gemm_sparse_sm80.cu       |   42 +-
 test/unit/gemm/warp/testbed.h                 |   42 +-
 test/unit/gemm/warp/wmma_sm70.cu              |   42 +-
 test/unit/gemm/warp/wmma_sm72.cu              |   42 +-
 test/unit/gemm/warp/wmma_sm75.cu              |   42 +-
 test/unit/layout/CMakeLists.txt               |   42 +-
 test/unit/layout/matrix.cu                    |   48 +-
 test/unit/layout/tensor.cu                    |   48 +-
 test/unit/layout/tensor_nhwc.cu               |   48 +-
 test/unit/nvrtc/CMakeLists.txt                |   42 +-
 test/unit/nvrtc/cutlass/nvrtc/environment.h   |   42 +-
 .../unit/nvrtc/kernel/thread/testbed_kernel.h |   42 +-
 test/unit/nvrtc/stdlib/stdint.h               |   42 +-
 test/unit/nvrtc/thread/CMakeLists.txt         |   42 +-
 test/unit/nvrtc/thread/gemm_nvrtc.cu          |   42 +-
 test/unit/nvrtc/thread/testbed.h              |   42 +-
 test/unit/reduction/CMakeLists.txt            |   42 +-
 test/unit/reduction/device/CMakeLists.txt     |   42 +-
 .../device/tensor_reduce_contiguous.cu        |   42 +-
 .../reduction/device/tensor_reduce_strided.cu |   42 +-
 test/unit/reduction/kernel/CMakeLists.txt     |   42 +-
 test/unit/reduction/kernel/reduce_splitk.cu   |   42 +-
 .../reduction/kernel/reduce_splitk_testbed.h  |   42 +-
 test/unit/reduction/thread/CMakeLists.txt     |   42 +-
 .../unit/reduction/thread/reduction_thread.cu |   42 +-
 test/unit/reduction/thread/testbed.h          |   42 +-
 test/unit/test_unit.cpp                       |   42 +-
 test/unit/transform/CMakeLists.txt            |   42 +-
 .../unit/transform/threadblock/CMakeLists.txt |   42 +-
 .../threadblock/predicated_tile_iterator.cu   |   42 +-
 .../regular_tile_iterator_tensor_op.cu        |   42 +-
 test/unit/util/CMakeLists.txt                 |   42 +-
 test/unit/util/cutlass_test_levels.cu         |   42 +-
 test/unit/util/tensor_reduce.cu               |   42 +-
 tools/CMakeLists.txt                          |   42 +-
 tools/library/CMakeLists.txt                  |   42 +-
 .../include/cutlass/library/arch_mappings.h   |   42 +-
 .../library/include/cutlass/library/handle.h  |   42 +-
 .../library/include/cutlass/library/library.h |  466 +++-
 .../include/cutlass/library/manifest.h        |   42 +-
 .../include/cutlass/library/operation_table.h |   42 +-
 .../include/cutlass/library/singleton.h       |   42 +-
 tools/library/include/cutlass/library/util.h  |   63 +-
 tools/library/scripts/gemm_operation.py       |  331 ++-
 tools/library/scripts/generator.py            | 1503 ++++++++++--
 tools/library/scripts/library.py              |  133 +
 tools/library/scripts/manifest.py             |   96 +-
 tools/library/scripts/rank_2k_operation.py    |  398 +++
 tools/library/scripts/rank_k_operation.py     |  387 +++
 tools/library/scripts/rt.py                   |  796 ++++++
 tools/library/scripts/symm_operation.py       |  400 +++
 tools/library/scripts/trmm_operation.py       |  407 ++++
 tools/library/src/conv2d_operation.h          |   42 +-
 tools/library/src/conv3d_operation.h          |   42 +-
 tools/library/src/gemm_operation.h            |  244 +-
 tools/library/src/handle.cu                   |   43 +-
 tools/library/src/library_internal.h          |   51 +-
 tools/library/src/manifest.cpp                |   42 +-
 tools/library/src/operation_table.cu          |   43 +-
 tools/library/src/rank_2k_operation.h         |  373 +++
 tools/library/src/rank_k_operation.h          |  344 +++
 .../reduction/init_reduction_operations.cu    |   44 +-
 .../library/src/reduction/reduction_device.cu |   76 +-
 .../src/reduction/reduction_operation.h       |   42 +-
 tools/library/src/reference/conv2d.cu         |   42 +-
 tools/library/src/reference/conv3d.cu         |   42 +-
 .../src/reference/conv_reference_operation.h  |   42 +-
 tools/library/src/reference/gemm.cu           |   42 +-
 .../src/reference/gemm_reference_operation.h  |   42 +-
 .../initialize_reference_operations.cu        |   42 +-
 tools/library/src/singleton.cu                |   43 +-
 tools/library/src/symm_operation.h            |  379 +++
 tools/library/src/trmm_operation.h            |  346 +++
 tools/library/src/util.cu                     |  248 +-
 tools/profiler/CMakeLists.txt                 |   55 +-
 .../profiler/src/conv2d_operation_profiler.cu |   42 +-
 .../profiler/src/conv2d_operation_profiler.h  |   42 +-
 .../profiler/src/conv3d_operation_profiler.cu |   42 +-
 .../profiler/src/conv3d_operation_profiler.h  |   42 +-
 tools/profiler/src/cublas_helpers.cu          |  857 ++++++-
 tools/profiler/src/cublas_helpers.h           |  174 +-
 tools/profiler/src/cudnn_helpers.cpp          |   42 +-
 tools/profiler/src/cudnn_helpers.h            |   42 +-
 tools/profiler/src/cutlass_profiler.cu        |   56 +-
 tools/profiler/src/cutlass_profiler.h         |   42 +-
 tools/profiler/src/debug.h                    |   42 +-
 tools/profiler/src/device_allocation.cu       |   42 +-
 tools/profiler/src/device_allocation.h        |   42 +-
 tools/profiler/src/device_context.cu          |   42 +-
 tools/profiler/src/device_context.h           |   42 +-
 tools/profiler/src/enumerated_types.cpp       |   42 +-
 tools/profiler/src/enumerated_types.h         |   42 +-
 tools/profiler/src/gemm_operation_profiler.cu |   42 +-
 tools/profiler/src/gemm_operation_profiler.h  |   42 +-
 tools/profiler/src/gpu_timer.cpp              |   42 +-
 tools/profiler/src/gpu_timer.h                |   42 +-
 tools/profiler/src/main.cpp                   |   42 +-
 tools/profiler/src/operation_profiler.cu      |   42 +-
 tools/profiler/src/operation_profiler.h       |   42 +-
 tools/profiler/src/options.cu                 |   42 +-
 tools/profiler/src/options.h                  |   42 +-
 tools/profiler/src/performance_report.cpp     |   42 +-
 tools/profiler/src/performance_report.h       |   42 +-
 tools/profiler/src/performance_result.cu      |   42 +-
 tools/profiler/src/performance_result.h       |   42 +-
 tools/profiler/src/problem_space.cpp          |   42 +-
 tools/profiler/src/problem_space.h            |   42 +-
 .../src/rank_2k_operation_profiler.cu         |  727 ++++++
 .../profiler/src/rank_2k_operation_profiler.h |  229 ++
 .../profiler/src/rank_k_operation_profiler.cu |  715 ++++++
 .../profiler/src/rank_k_operation_profiler.h  |  227 ++
 .../src/reduction_operation_profiler.h        |   42 +-
 .../src/sparse_gemm_operation_profiler.cu     |   42 +-
 .../src/sparse_gemm_operation_profiler.h      |   42 +-
 tools/profiler/src/symm_operation_profiler.cu |  764 ++++++
 tools/profiler/src/symm_operation_profiler.h  |  230 ++
 tools/profiler/src/trmm_operation_profiler.cu |  704 ++++++
 tools/profiler/src/trmm_operation_profiler.h  |  222 ++
 tools/util/CMakeLists.txt                     |   42 +-
 .../util/include/cutlass/util/command_line.h  |   36 +-
 tools/util/include/cutlass/util/debug.h       |   42 +-
 tools/util/include/cutlass/util/device_dump.h |   42 +-
 .../util/include/cutlass/util/device_memory.h |   36 +-
 .../cutlass/util/device_nchw_to_nhwc.h        |  141 ++
 .../cutlass/util/device_nhwc_padding.h        |  276 +++
 .../cutlass/util/device_nhwc_to_nchw.h        |  144 ++
 .../util/include/cutlass/util/distribution.h  |   42 +-
 tools/util/include/cutlass/util/exceptions.h  |   36 +-
 .../util/include/cutlass/util/host_reorder.h  |   42 +-
 tools/util/include/cutlass/util/host_tensor.h |   42 +-
 .../cutlass/util/host_tensor_planar_complex.h |   42 +-
 .../include/cutlass/util/host_uncompress.h    |   42 +-
 .../include/cutlass/util/index_sequence.h     |   66 +-
 .../util/reference/detail/inner_product.h     |   42 +-
 .../reference/detail/linear_to_coordinate.h   |   42 +-
 .../util/reference/device/convolution.h       |   42 +-
 .../cutlass/util/reference/device/gemm.h      |   42 +-
 .../util/reference/device/gemm_complex.h      |  146 +-
 .../reference/device/gemm_planar_complex.h    |   42 +-
 .../util/reference/device/kernel/gemm.h       |   42 +-
 .../device/kernel/tensor_elementwise.h        |   42 +-
 .../reference/device/kernel/tensor_foreach.h  |   42 +-
 .../util/reference/device/rank_2k_complex.h   |  355 +++
 .../util/reference/device/tensor_compare.h    |   42 +-
 .../util/reference/device/tensor_fill.h       |  301 ++-
 .../util/reference/device/tensor_foreach.h    |   42 +-
 .../util/reference/device/tensor_reduce.h     |   42 +-
 .../util/reference/device/tensor_relu.h       |   42 +-
 .../util/reference/device/thread/gemm.h       |   42 +-
 .../cutlass/util/reference/host/convolution.h |   42 +-
 .../util/reference/host/error_metrics.h       |   42 +-
 .../cutlass/util/reference/host/gemm.h        |   42 +-
 .../util/reference/host/gemm_complex.h        |   42 +-
 .../util/reference/host/gemm_planar_complex.h |   42 +-
 .../cutlass/util/reference/host/rank_2k.h     |  261 ++
 .../util/reference/host/rank_2k_complex.h     |  318 +++
 .../util/reference/host/rank_k_complex.h      |  234 ++
 .../cutlass/util/reference/host/symm.h        |  285 +++
 .../util/reference/host/symm_complex.h        |  319 +++
 .../util/reference/host/tensor_compare.h      |   42 +-
 .../cutlass/util/reference/host/tensor_copy.h |   42 +-
 .../util/reference/host/tensor_elementwise.h  |   42 +-
 .../cutlass/util/reference/host/tensor_fill.h |  280 ++-
 .../util/reference/host/tensor_foreach.h      |   42 +-
 .../cutlass/util/reference/host/tensor_norm.h |   42 +-
 .../util/reference/host/tensor_reduce.h       |   42 +-
 .../cutlass/util/reference/host/trmm.h        |  215 ++
 .../util/reference/host/trmm_complex.h        |  262 ++
 .../include/cutlass/util/tensor_view_io.h     |   48 +-
 tools/util/include/cutlass/util/type_traits.h |   42 +-
 1100 files changed, 94860 insertions(+), 20427 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 delete mode 100644 .github/ISSUE_TEMPLATE/documentation_request.md
 delete mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 delete mode 100644 .github/ISSUE_TEMPLATE/submit_question.md
 delete mode 100644 .github/workflows/labeler.yml
 delete mode 100644 .github/workflows/new-issues-to-triage-projects.yml
 delete mode 100644 .github/workflows/stale.yml
 delete mode 100644 examples/13_two_tensor_op_fusion/fused_conv2d.cu
 delete mode 100644 examples/13_two_tensor_op_fusion/fused_gemm.cu
 rename examples/13_two_tensor_op_fusion/{b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h => fused_two_convs_f16_sm75_rf.cu} (60%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h => fused_two_convs_f16_sm80_rf.cu} (71%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h => fused_two_convs_s8_sm75_rf.cu} (61%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h => fused_two_convs_s8_sm80_rf.cu} (74%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h => fused_two_gemms_f16_sm75_rf.cu} (67%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h => fused_two_gemms_f16_sm80_rf.cu} (74%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h => fused_two_gemms_s8_sm75_rf.cu} (70%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
 rename examples/13_two_tensor_op_fusion/{b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h => fused_two_gemms_s8_sm80_rf.cu} (72%)
 create mode 100644 examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
 create mode 100644 examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
 create mode 100644 examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
 create mode 100644 examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
 create mode 100644 examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
 create mode 100644 examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
 create mode 100644 examples/13_two_tensor_op_fusion/test_run.h
 create mode 100644 examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
 create mode 100644 examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
 create mode 100644 examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
 create mode 100644 examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
 create mode 100644 examples/31_basic_syrk/CMakeLists.txt
 create mode 100644 examples/31_basic_syrk/basic_syrk.cu
 create mode 100644 examples/32_basic_trmm/CMakeLists.txt
 create mode 100644 examples/32_basic_trmm/basic_trmm.cu
 create mode 100644 examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
 create mode 100644 examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
 create mode 100644 examples/35_gemm_softmax/CMakeLists.txt
 create mode 100644 examples/35_gemm_softmax/epilogue_with_visitor.h
 create mode 100644 examples/35_gemm_softmax/gemm_softmax.cu
 create mode 100644 examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
 create mode 100644 examples/35_gemm_softmax/gemm_with_softmax.h
 create mode 100644 examples/36_gather_scatter_fusion/CMakeLists.txt
 create mode 100644 examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
 create mode 100644 examples/40_cutlass_py/test-cutlass-py.py
 create mode 100644 include/cutlass/blas3.h
 create mode 100644 include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
 create mode 100644 include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
 create mode 100644 include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
 create mode 100644 include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
 create mode 100644 include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
 create mode 100644 include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
 create mode 100644 include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
 create mode 100644 include/cutlass/gemm/device/rank_2k.h
 create mode 100644 include/cutlass/gemm/device/rank_k.h
 create mode 100755 include/cutlass/gemm/device/symm.h
 create mode 100644 include/cutlass/gemm/device/trmm.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_2k.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_2k_complex.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_2k_universal.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_k.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_k_complex.h
 create mode 100644 include/cutlass/gemm/kernel/default_rank_k_universal.h
 create mode 100755 include/cutlass/gemm/kernel/default_symm.h
 create mode 100755 include/cutlass/gemm/kernel/default_symm_complex.h
 create mode 100755 include/cutlass/gemm/kernel/default_symm_universal.h
 create mode 100644 include/cutlass/gemm/kernel/default_trmm.h
 create mode 100644 include/cutlass/gemm/kernel/default_trmm_complex.h
 create mode 100644 include/cutlass/gemm/kernel/default_trmm_universal.h
 create mode 100644 include/cutlass/gemm/kernel/rank_2k_universal.h
 create mode 100644 include/cutlass/gemm/kernel/rank_k_universal.h
 create mode 100755 include/cutlass/gemm/kernel/symm_universal.h
 create mode 100644 include/cutlass/gemm/kernel/trmm_universal.h
 create mode 100644 include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
 create mode 100644 include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
 create mode 100644 include/cutlass/gemm/threadblock/default_trmm.h
 create mode 100644 include/cutlass/gemm/threadblock/mma_blas3_multistage.h
 create mode 100644 include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
 create mode 100644 include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
 create mode 100644 media/images/13_example_shmem_resident_fusion.png
 create mode 100644 test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
 create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
 create mode 100644 test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
 create mode 100644 test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/testbed_rank2k_universal.h
 create mode 100644 test/unit/gemm/device/testbed_rank_k_universal.h
 create mode 100644 test/unit/gemm/device/testbed_symm_universal.h
 create mode 100644 test/unit/gemm/device/testbed_trmm_universal.h
 create mode 100644 test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
 create mode 100644 test/unit/gemm/threadblock/mma_multistage_slicedk.cu
 create mode 100644 test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
 create mode 100644 test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
 create mode 100644 test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
 create mode 100644 tools/library/scripts/rank_2k_operation.py
 create mode 100644 tools/library/scripts/rank_k_operation.py
 create mode 100644 tools/library/scripts/rt.py
 create mode 100644 tools/library/scripts/symm_operation.py
 create mode 100644 tools/library/scripts/trmm_operation.py
 create mode 100644 tools/library/src/rank_2k_operation.h
 create mode 100644 tools/library/src/rank_k_operation.h
 create mode 100644 tools/library/src/symm_operation.h
 create mode 100644 tools/library/src/trmm_operation.h
 create mode 100644 tools/profiler/src/rank_2k_operation_profiler.cu
 create mode 100644 tools/profiler/src/rank_2k_operation_profiler.h
 create mode 100644 tools/profiler/src/rank_k_operation_profiler.cu
 create mode 100644 tools/profiler/src/rank_k_operation_profiler.h
 create mode 100644 tools/profiler/src/symm_operation_profiler.cu
 create mode 100644 tools/profiler/src/symm_operation_profiler.h
 create mode 100644 tools/profiler/src/trmm_operation_profiler.cu
 create mode 100644 tools/profiler/src/trmm_operation_profiler.h
 create mode 100644 tools/util/include/cutlass/util/device_nchw_to_nhwc.h
 create mode 100644 tools/util/include/cutlass/util/device_nhwc_padding.h
 create mode 100644 tools/util/include/cutlass/util/device_nhwc_to_nchw.h
 create mode 100644 tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/rank_2k.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/rank_k_complex.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/symm.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/symm_complex.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/trmm.h
 create mode 100644 tools/util/include/cutlass/util/reference/host/trmm_complex.h

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
deleted file mode 100644
index 86322616..00000000
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ /dev/null
@@ -1,23 +0,0 @@
----
-name: Bug report
-about: Create a bug report to help us improve CUTLASS
-title: "[BUG]"
-labels: "? - Needs Triage, bug"
-assignees: ''
-
----
-
-**Describe the bug**
-A clear and concise description of what the bug is.
-
-**Steps/Code to reproduce bug**
-Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly.
-
-**Expected behavior**
-A clear and concise description of what you expected to happen.
-
-**Environment details (please complete the following information):**
- - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)] 
-
-**Additional context**
-Add any other context about the problem here.
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/documentation_request.md b/.github/ISSUE_TEMPLATE/documentation_request.md
deleted file mode 100644
index 9e96105f..00000000
--- a/.github/ISSUE_TEMPLATE/documentation_request.md
+++ /dev/null
@@ -1,35 +0,0 @@
----
-name: Documentation request
-about: Report incorrect or needed documentation to improve CUTLASS
-title: "[DOC]"
-labels: "? - Needs Triage, documentation"
-assignees: ''
-
----
-
-## Report incorrect documentation
-
-**Location of incorrect documentation**
-Provide links and line numbers if applicable.
-
-**Describe the problems or issues found in the documentation**
-A clear and concise description of what you found to be incorrect.
-
-**Steps taken to verify documentation is incorrect**
-List any steps you have taken:
-
-**Suggested fix for documentation**
-Detail proposed changes to fix the documentation if you have any.
-
----
-
-## Report needed documentation
-
-**Report needed documentation**
-A clear and concise description of what documentation you believe it is needed and why.
-
-**Describe the documentation you'd like**
-A clear and concise description of what you want to happen.
-
-**Steps taken to search for needed documentation**
-List any steps you have taken:
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
deleted file mode 100644
index 9ea8e6de..00000000
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ /dev/null
@@ -1,20 +0,0 @@
----
-name: Feature request
-about: Suggest an idea for CUTLASS
-title: "[FEA]"
-labels: "? - Needs Triage, feature request"
-assignees: ''
-
----
-
-**Is your feature request related to a problem? Please describe.**
-A clear and concise description of what the problem is. Ex. I wish I could use CUTLASS to do [...]
-
-**Describe the solution you'd like**
-A clear and concise description of what you want to happen.
-
-**Describe alternatives you've considered**
-A clear and concise description of any alternative solutions or features you've considered.
-
-**Additional context**
-Add any other context, code examples, or references to existing implementations about the feature request here.
diff --git a/.github/ISSUE_TEMPLATE/submit_question.md b/.github/ISSUE_TEMPLATE/submit_question.md
deleted file mode 100644
index 743f893f..00000000
--- a/.github/ISSUE_TEMPLATE/submit_question.md
+++ /dev/null
@@ -1,10 +0,0 @@
----
-name: Submit question
-about: Ask a general question about CUTLASS
-title: "[QST]"
-labels: "? - Needs Triage, question"
-assignees: ''
-
----
-
-**What is your question?**
\ No newline at end of file
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 6510938e..00000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-name: "Pull Request Labeler"
-on:
-- pull_request_target
-
-jobs:
-  triage:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/labeler@main
-      with:
-        repo-token: "${{ secrets.GITHUB_TOKEN }}"
\ No newline at end of file
diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml
deleted file mode 100644
index 3049176e..00000000
--- a/.github/workflows/new-issues-to-triage-projects.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Auto Assign New Issues to Triage Project
-
-on:
-  issues:
-    types: [opened]
-
-env:
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-jobs:
-  assign_one_project:
-    runs-on: ubuntu-latest
-    name: Assign to New Issues to Triage Project
-    steps:
-    - name: Process bug issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process feature issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process other issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request'))
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/NVIDIA/cutlass
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
\ No newline at end of file
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
deleted file mode 100644
index cb2e7275..00000000
--- a/.github/workflows/stale.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: Mark inactive issues and pull requests
-
-on:
-  schedule:
-    - cron: "0 * * * *"
-
-jobs:
-  mark-inactive-30d:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark 30 day inactive issues and pull requests
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: >
-            This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days.
-            Please close this issue if no further response or action is needed.
-            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-            This issue will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-issue-label: "inactive-30d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 30
-          days-before-issue-close: -1
-          stale-pr-message: >
-            This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days.
-            Please close this PR if it is no longer required.
-            Otherwise, please respond with a comment indicating any updates.
-            This PR will be labeled `inactive-90d` if there is no activity in the next 60 days.
-          stale-pr-label: "inactive-30d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 30
-          days-before-pr-close: -1
-          operations-per-run: 50
-  mark-inactive-90d:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Mark 90 day inactive issues and pull requests
-        uses: actions/stale@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          stale-issue-message: >
-            This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days.
-            Please close this issue if no further response or action is needed.
-            Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed.
-          stale-issue-label: "inactive-90d"
-          exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-issue-stale: 90
-          days-before-issue-close: -1
-          stale-pr-message: >
-            This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days.
-            Please close this PR if it is no longer required.
-            Otherwise, please respond with a comment indicating any updates.
-          stale-pr-label: "inactive-90d"
-          exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue"
-          days-before-pr-stale: 90
-          days-before-pr-close: -1
-          operations-per-run: 50
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d83d451d..ecfd594d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,26 @@
 # NVIDIA CUTLASS Changelog
 
+## [2.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.9.0) (2022-04-21)
+
+* [First layer Convolution kernels](/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment
+  * [Few channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities
+  * [Fixed channels](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size
+  * [Unit tests](/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu)
+  * [Python-based instance emitter](/tools/library/scripts/generator.py) in the CUTLASS Library and support in the Profiler
+* [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
+  * Supported types: f32, cf32, f64, cf64
+  * [HERK](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)
+  * [SYRK](/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](/tools/library/scripts/rank_k_operation.py)
+  * [SYMM](/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/symm_operation.py)
+  * [TRMM](/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](/tools/library/scripts/trmm_operation.py)
+  * [Unit tests](/test/unit/gemm/device/testbed_rank_k_universal.h)
+* [CUTLASS Python](/example/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python)
+  * [Python-based runtime](/tools/library/scripts/rt.py) interoperable with existing emitters
+* [GEMM + Softmax example](/examples/35_gemm_softmax)
+* Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads)
+* Updates and bugfixes from the community (thanks!)
+
+
 ## [2.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.8.0) (2021-11-19)
 
 * **TF32x3:** emulated single-precision using Tensor Cores
@@ -23,7 +44,6 @@
   * Ubuntu 16.04
   * CUDA 10.2
 
-
 ## [2.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.7.0) (2021-09-24)
   * Mainloop fusion for GEMM: [summation over A or B](/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu)
   * [Strided DGRAD (optimized iterators)](/include/cutlass/conv/kernel/default_conv2d_dgrad.h)
@@ -210,27 +230,33 @@
 
 ## Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3538b6c1..7b7284b9 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cmake_minimum_required(VERSION 3.12.4 FATAL_ERROR)
@@ -32,7 +38,7 @@ endif()
 
 message(STATUS "CMake Version: ${CMAKE_VERSION}")
 
-project(CUTLASS VERSION 2.8.0 LANGUAGES CXX)
+project(CUTLASS VERSION 2.9.0 LANGUAGES CXX)
 include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
 
 if (CUDA_VERSION VERSION_LESS 10.2)
@@ -83,7 +89,7 @@ set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUT
 set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
 
 if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
-  set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY})
+  set(CUTLASS_ENABLE_TESTS_INIT ${CUTLASS_ENABLE_LIBRARY}})
 else()
   set(CUTLASS_ENABLE_TESTS_INIT OFF)
 endif()
@@ -187,11 +193,9 @@ set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of opera
 set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
 set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
 
-
 # Test Levels L0, L1, L2
 set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
 
-
 set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests")
 
 set_property(CACHE CUTLASS_TEST_LEVEL PROPERTY STRINGS 0 1 2)
@@ -203,7 +207,6 @@ if (CUTLASS_TEST_ENABLE_CACHED_RESULTS)
   list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_TEST_ENABLE_CACHED_RESULTS=1)
 endif()
 
-
 set(CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED ON CACHE BOOL "Enable/Disable rigorous conv problem sizes in conv unit tests")
 
 if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
@@ -211,7 +214,6 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
   list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
 endif()
 
-
 #
 # CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
 #
@@ -748,7 +750,7 @@ if (CUTLASS_INSTALL_TESTS)
 
   install(
     FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake"
-    DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/
+    DESTINATION "${CUTLASS_TEST_INSTALL_PREFIX}/"
     )
 
 endif()
diff --git a/CUDA.cmake b/CUDA.cmake
index db1978a5..ff6a6afc 100644
--- a/CUDA.cmake
+++ b/CUDA.cmake
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 if(CUDA_COMPILER MATCHES "[Cc]lang")
@@ -213,8 +219,7 @@ function(cutlass_correct_source_file_language_property)
   endif()
 endfunction()
 
-# If building with all kernels, set UNITY build on by default.
-if (CUTLASS_LIBRARY_KERNELS MATCHES "all")
+if (MSVC OR CUTLASS_LIBRARY_KERNELS MATCHES "all")
   set(CUTLASS_UNITY_BUILD_ENABLED_INIT ON)
 else()
   set(CUTLASS_UNITY_BUILD_ENABLED_INIT OFF)
diff --git a/LICENSE.txt b/LICENSE.txt
index 64a49d68..d9219ec9 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,23 +1,27 @@
-Copyright (c) 2017 - 2020, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
-   *  Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-   *  Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-   *  Neither the name of the NVIDIA CORPORATION nor the
-      names of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+1. Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PUBLICATIONS.md b/PUBLICATIONS.md
index a5eb903f..5c36742a 100644
--- a/PUBLICATIONS.md
+++ b/PUBLICATIONS.md
@@ -4,7 +4,7 @@
 
 - ["Bolt: Bridging the Gap between Auto-tuners and Hardware-native Performance"](https://arxiv.org/abs/2110.15238). Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, Yibo Zhu. _Proceedings of the 5th MLSys Conference_, August 2022.
 
-- ["Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance"](https://arxiv.org/abs/2203.03341). Hiroyuki Ootomo,  Rio Yokota. _International Journal of High Performance Computing_, March 2022.
+- ["Recovering single precision accuracy from Tensor Cores while surpassing the FP32 theoretical peak performance"](https://arxiv.org/abs/2203.03341). Hiroyuki Ootomo, Rio Yokota. _International Journal of High Performance Computing_, March 2022.
 
 ## 2021
 
diff --git a/README.md b/README.md
index 6d1b1905..8540dd9a 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 
-# CUTLASS 2.8
+# CUTLASS 2.9
 
-_CUTLASS 2.8 - November 2021_
+_CUTLASS 2.9 - April 2022_
 
 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-multiplication (GEMM) and related computations at all levels 
@@ -34,18 +34,23 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly.
 See the [functionality listing](/media/docs/functionality.md) for the list of operations
 supported at each level of the execution model hierarchy.
 
-# What's New in CUTLASS 2.8
-CUTLASS 2.8 is an update to CUTLASS adding:
-- [TF32x3:](/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm) emulated single-precision using Tensor Cores; 45+ TFLOPs on NVIDIA A100
-- [Mainloop fusion for Convolution:](/examples/25_ampere_fprop_mainloop_fusion) convolution with fused per-channel bias-add
-- [Grouped GEMM:](/examples/24_gemm_grouped) similar to batched GEMM with distinct problem size per group
-- [Implicit GEMM Convolution fusion](/examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing.
-- Optimal performance using [CUDA 11.5](https://developer.nvidia.com/cuda-downloads)
+# What's New in CUTLASS 2.9
+
+CUTLASS 2.9 is an update to CUTLASS adding:
+- [First layer Convolution kernels](/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment
+- [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators accelerated by Tensor Cores
+  - [SYRK](/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu), [HERK](/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu),
+  - [SYR2K](/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu), [HER2K](/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu),
+  - [Out-of-place TRMM](/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu), and 
+  - [SYMM](/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu), [HEMM](/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu)
+- [CUTLASS Python](/example/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python)
+- [GEMM + Softmax example](/examples/35_gemm_softmax)
+- Optimal performance using [CUDA 11.6u2](https://developer.nvidia.com/cuda-downloads)
+- Updates and bugfixes from the community (thanks!)
 - **Deprecation announcement:** CUTLASS plans to deprecate the following:
   - Maxwell and Pascal GPU architectures
   - Ubuntu 16.04
   - CUDA 10.2
-- Updates and bugfixes from the community (thanks!)
 
 **See the [CHANGELOG](CHANGELOG.md) for a detailed listing of releases and updates.**
 
@@ -66,8 +71,8 @@ compiled with the [CUDA 11.5 Toolkit](https://developer.nvidia.com/cuda-download
 # Compatibility
 
 CUTLASS requires a C++11 host compiler and 
-performs best when built with the [CUDA 11.5 Toolkit](https://developer.nvidia.com/cuda-toolkit).
-It is also compatible with CUDA 11.0, CUDA 11.1, CUDA 11.2, CUDA 11.3, and CUDA 11.4.
+performs best when built with the [**CUDA 11.6u2 Toolkit**](https://developer.nvidia.com/cuda-toolkit).
+It is also compatible with CUDA 11.0, CUDA 11.1, CUDA 11.2, CUDA 11.3, CUDA 11.4, and CUDA 11.5.
 
 We have tested the following environments.
 
@@ -75,8 +80,10 @@ We have tested the following environments.
 |-----------------|----------|
 | Windows 10      | Microsoft Visual Studio 2015|
 |                 | Microsoft Visual Studio 2017|
+|                 | Microsoft Visual Studio 2019|
 | Ubuntu 18.04 | GCC 7.5.0 |
 | Ubuntu 20.04 | GCC 10.3.0 |
+| Ubuntu 21.04 | GCC 11.2.0 |
 
 Additionally, CUTLASS may be built with clang. 
 See [these instructions](media/docs/quickstart.md#clang) for more details.
@@ -84,10 +91,7 @@ See [these instructions](media/docs/quickstart.md#clang) for more details.
 CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
 any Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU. 
 
-For all GPUs, we recommend compiling with the [**CUDA 11.5 Toolkit**](https://developer.nvidia.com/cuda-toolkit)
-for best performance. 
-
-|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
+|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**Minimum CUDA Toolkit Enabling Native Tensor Cores**|
 |---|---|---|---|
 |NVIDIA Tesla V100|7.0|9.2|10.1|
 |NVIDIA TitanV|7.0|9.2|10.1|
@@ -97,6 +101,9 @@ for best performance.
 |NVIDIA A10 |8.6|11.1|11.1|
 |NVIDIA GeForce 3090|8.6|11.1|11.1|
 
+For all GPUs, we recommend compiling with the [CUDA 11.6u2 Toolkit](https://developer.nvidia.com/cuda-toolkit)
+for best performance.
+
 # Documentation
 
 CUTLASS is described in the following documents and the accompanying
@@ -230,6 +237,16 @@ examples/
   13_fused_two_gemms/              # example demonstrating two GEMms fused in one kernel
 
   22_ampere_tensorop_conv2dfprop/  # example demonstrating integer implicit GEMM convolution (forward propagation) using Ampere Tensor Cores
+
+  31_basic_syrk                    # example demonstrating Symetric rank-K update
+
+  32_basic_trmm                    #
+
+  33_ampere_3xtf32_tensorop_symm   #
+
+  35_gemm_softmax                  # example demonstrating GEMM fused with Softmax in mixed precision using Ampere Tensor Cores
+
+  40_cutlass_py                    # example demonstrating CUTLASS with CUDA Python
 ```
 
 ### Tools
@@ -485,27 +502,33 @@ The official list of CUTLASS developers and contributors is available here: [CON
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
 
diff --git a/cmake/nop.cu b/cmake/nop.cu
index 77216e5c..f4775572 100644
--- a/cmake/nop.cu
+++ b/cmake/nop.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/cuBLAS.cmake b/cuBLAS.cmake
index fd2b8799..a66274eb 100644
--- a/cuBLAS.cmake
+++ b/cuBLAS.cmake
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 message(STATUS "Configuring cublas ...")
diff --git a/cuDNN.cmake b/cuDNN.cmake
index ca20806b..4f89f43d 100644
--- a/cuDNN.cmake
+++ b/cuDNN.cmake
@@ -1,24 +1,29 @@
-
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 if(DEFINED CUDNN_ENABLED)
diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt
index 38d93d11..5af8fcf3 100644
--- a/examples/00_basic_gemm/CMakeLists.txt
+++ b/examples/00_basic_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   00_basic_gemm
   basic_gemm.cu
diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu
index f3d37259..7c633b30 100644
--- a/examples/00_basic_gemm/basic_gemm.cu
+++ b/examples/00_basic_gemm/basic_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt
index dbed841b..56733038 100644
--- a/examples/01_cutlass_utilities/CMakeLists.txt
+++ b/examples/01_cutlass_utilities/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   01_cutlass_utilities
   cutlass_utilities.cu
diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu
index ead40ea6..30ec2841 100644
--- a/examples/01_cutlass_utilities/cutlass_utilities.cu
+++ b/examples/01_cutlass_utilities/cutlass_utilities.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt
index 45388ae0..9dd94ab4 100644
--- a/examples/02_dump_reg_shmem/CMakeLists.txt
+++ b/examples/02_dump_reg_shmem/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   02_dump_reg_shmem 
   dump_reg_shmem.cu
diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
index 41dec4b7..159b0b49 100644
--- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu
+++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
@@ -1,27 +1,31 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- *modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its
- *contributors may be used to endorse or promote products derived from this
- *software without specific prior written permission.
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
- *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 
diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt
index 7df55fd1..27c38249 100644
--- a/examples/03_visualize_layout/CMakeLists.txt
+++ b/examples/03_visualize_layout/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 set(TEST_COMMAND_00 RowMajor --extent=16,16)
 set(TEST_COMMAND_01 \"ColumnMajorInterleaved<4>\" --extent=32,8 --output-shape=16 --vectorize=4)
 
diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h
index f69f96ab..2b1d8fdb 100644
--- a/examples/03_visualize_layout/options.h
+++ b/examples/03_visualize_layout/options.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu
index 2d3e4b90..060abe35 100644
--- a/examples/03_visualize_layout/register_layout.cu
+++ b/examples/03_visualize_layout/register_layout.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -55,27 +61,49 @@ void RegisterLayouts(std::map<std::string, std::unique_ptr<VisualizeLayoutBase>
        new VisualizeLayout<cutlass::layout::ColumnMajorInterleaved<4>>},
       {"RowMajorInterleaved<4>",
        new VisualizeLayout<cutlass::layout::RowMajorInterleaved<4>>},
+      // All Ampere/Turing H/Integer matrix multiply tensor core kernels uses the same swizzling
+      // layout implementation with different templates.
+      //
+      // BMMA 88128  Interleaved-256
+      // BMMA 168256 Interleaved-256
+      {"TensorOpMultiplicand<1,256>",
+       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 256>>},
+      // BMMA 88128  TN kblock512
+      // BMMA 168256 TN kblock512
+      {"TensorOpMultiplicand<1,512>",
+       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 512>>},
+      // BMMA 168256 TN kblock1024
+      {"TensorOpMultiplicand<1,1024>",
+       new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<1, 1024>>},
       // Integer matrix multiply.int4 8832  Interleaved-64
+      // Integer matrix multiply.int4 16864 Interleaved-64
       {"TensorOpMultiplicand<4,64>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 64>>},
       // Integer matrix multiply.int4 8832  TN kblock128
+      // Integer matrix multiply.int4 16864 TN kblock128
       {"TensorOpMultiplicand<4,128>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 128>>},
       // Integer matrix multiply.int4 16864 TN kblock256
       {"TensorOpMultiplicand<4,256>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<4, 256>>},
       // Integer matrix multiply 8816  Interleaved-32
+      // Integer matrix multiply 16832 Interleaved-32
       {"TensorOpMultiplicand<8,32>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 32>>},
       // Integer matrix multiply 8816  TN kblock64
+      // Integer matrix multiply 16832 TN kblock64
       {"TensorOpMultiplicand<8,64>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 64>>},
+      // Integer matrix multiply 16832 TN kblock128
       {"TensorOpMultiplicand<8,128>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<8, 128>>},
       // Matrix Multiply 1688  TN kblock32
+      // Matrix multiply 16816 TN kblock32
       {"TensorOpMultiplicand<16,32>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 32>>},
       // Matrix multiply 1688  NT
+      // Matrix multiply 16816 NT
+      // Matrix multiply 16816 TN kblock64
       {"TensorOpMultiplicand<16,64>",
        new VisualizeLayout<cutlass::layout::TensorOpMultiplicand<16, 64>>},
       // Matrix multiply 1688.TF32 TN kblock16
diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h
index 23a45611..b473279a 100644
--- a/examples/03_visualize_layout/register_layout.h
+++ b/examples/03_visualize_layout/register_layout.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp
index cb52da2f..81be3290 100644
--- a/examples/03_visualize_layout/visualize_layout.cpp
+++ b/examples/03_visualize_layout/visualize_layout.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h
index f4214678..7c9a1bc2 100644
--- a/examples/03_visualize_layout/visualize_layout.h
+++ b/examples/03_visualize_layout/visualize_layout.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt
index 0732545e..52e1d665 100644
--- a/examples/04_tile_iterator/CMakeLists.txt
+++ b/examples/04_tile_iterator/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   04_tile_iterator
   tile_iterator.cu
diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu
index 9048dfec..8fc19319 100644
--- a/examples/04_tile_iterator/tile_iterator.cu
+++ b/examples/04_tile_iterator/tile_iterator.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt
index 5857ca31..f42e76b2 100644
--- a/examples/05_batched_gemm/CMakeLists.txt
+++ b/examples/05_batched_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   05_batched_gemm
   batched_gemm.cu
diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu
index e39d405b..2ce552c7 100644
--- a/examples/05_batched_gemm/batched_gemm.cu
+++ b/examples/05_batched_gemm/batched_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -34,7 +40,7 @@
 #pragma warning( disable : 4503)
 
 /*
-This example demonstrates how to use cutlass to compute a batched gemm in two different ways:
+This example demonstrates how to use cutlass to compute a batched strided gemm in two different ways:
   1. By specifying pointers to the first matrices of the batch and the stride between the consecutive
      matrices of the batch (this is called a strided batched gemm).
   2. By copying pointers to all matrices of the batch to the device memory (this is called an array gemm).
@@ -231,6 +237,7 @@ cudaError_t strided_batched_gemm_nn_reference(
   return result;
 }
 
+
 cudaError_t run_batched_gemm(bool use_array) {
 
   const char* gemm_desc = use_array ? "array" : "strided batched";
diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt
index 728b257c..04d7af8c 100644
--- a/examples/06_splitK_gemm/CMakeLists.txt
+++ b/examples/06_splitK_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   06_splitK_gemm
   splitk_gemm.cu
diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu
index 3dd87bce..03e60788 100644
--- a/examples/06_splitK_gemm/splitk_gemm.cu
+++ b/examples/06_splitK_gemm/splitk_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt
index b0074829..c53367ac 100644
--- a/examples/07_volta_tensorop_gemm/CMakeLists.txt
+++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   07_volta_tensorop_gemm
   volta_tensorop_gemm.cu
diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
index efb71b82..eecd86cd 100644
--- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
+++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt
index 3353dca0..c84bc6d4 100644
--- a/examples/08_turing_tensorop_gemm/CMakeLists.txt
+++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   08_turing_tensorop_gemm
   turing_tensorop_gemm.cu
diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
index 57c63536..1869c41f 100644
--- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
index 47630cdd..09057a28 100644
--- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   09_turing_tensorop_conv2dfprop
   turing_tensorop_conv2dfprop.cu
diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
index c0700fd4..b7391216 100644
--- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
+++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -333,21 +339,21 @@ struct Options {
       << "  forward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt
index 2fec5831..eaf41fde 100644
--- a/examples/10_planar_complex/CMakeLists.txt
+++ b/examples/10_planar_complex/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 # Planar Complex GEMM example
 cutlass_example_add_executable(
   10_planar_complex
diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu
index 4bfec118..c7dc7b27 100644
--- a/examples/10_planar_complex/planar_complex.cu
+++ b/examples/10_planar_complex/planar_complex.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -167,15 +173,15 @@ struct Options {
       << "  This example uses the CUTLASS Library to execute Planar Complex GEMM computations.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --batch <int>               Number of GEMM operations executed in one batch\n"
-      << "  --alpha <f32>               Epilogue scalar alpha (real part)\n"
-      << "  --alpha_i <f32>             Epilogue scalar alpha (imaginary part)\n"
-      << "  --beta <f32>                Epilogue scalar beta (real part)\n\n"
-      << "  --beta_i <f32>              Epilogue scalar beta (imaginary part)\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n\n";
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/10_planar_complex/10_planar_complex  --batch=7 --m=1024 --n=512 --k=1024 \\\n"
diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt
index 33325790..b5ad07cf 100644
--- a/examples/11_planar_complex_array/CMakeLists.txt
+++ b/examples/11_planar_complex_array/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 # Planar Complex Array GEMM example
 cutlass_example_add_executable(
   11_planar_complex_array
diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu
index 1b321884..1dd35846 100644
--- a/examples/11_planar_complex_array/planar_complex_array.cu
+++ b/examples/11_planar_complex_array/planar_complex_array.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -165,15 +171,15 @@ struct Options {
       << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --batch <int>               Number of GEMM operations executed in one batch\n"
-      << "  --alpha <f32>               Epilogue scalar alpha (real part)\n"
-      << "  --alpha_i <f32>             Epilogue scalar alpha (imaginary part)\n"
-      << "  --beta <f32>                Epilogue scalar beta (real part)\n\n"
-      << "  --beta_i <f32>              Epilogue scalar beta (imaginary part)\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n";
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/11_planar_complex_array/11_planar_complex_array\n\n";
diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt
index e298aece..3b681b31 100644
--- a/examples/12_gemm_bias_relu/CMakeLists.txt
+++ b/examples/12_gemm_bias_relu/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   12_gemm_bias_relu
   gemm_bias_relu.cu
diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
index cc904139..43808354 100644
--- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu
+++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -294,4 +300,3 @@ int main() {
 
   return run();
 }
-  
diff --git a/examples/13_two_tensor_op_fusion/CMakeLists.txt b/examples/13_two_tensor_op_fusion/CMakeLists.txt
index 2963ad25..04d55bbe 100644
--- a/examples/13_two_tensor_op_fusion/CMakeLists.txt
+++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt
@@ -1,45 +1,82 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-cutlass_example_add_executable(
-  13_fused_two_gemms
-  fused_gemm.cu
-  )
 
-cutlass_example_add_executable(
-  13_fused_two_convs
-  fused_conv2d.cu
-  )
+include_directories(
+  .
+)
 
+add_custom_target(13_fused_two_gemms)
 
-target_include_directories(
-  13_fused_two_gemms
-  PRIVATE
-  .
+add_custom_target(13_fused_two_convs)
+
+add_custom_target(13_two_tensor_op_fusion
+DEPENDS 13_fused_two_gemms
+        13_fused_two_convs
+)
+
+foreach(FUSION_CONV_EXAMPLE
+  fused_two_convs_f16_sm75_rf
+  fused_two_convs_f16_sm75_shmem
+  fused_two_convs_f16_sm80_rf
+  fused_two_convs_f16_sm80_shmem
+  fused_two_convs_s8_sm75_rf
+  fused_two_convs_s8_sm75_shmem
+  fused_two_convs_s8_sm80_rf
+  fused_two_convs_s8_sm80_shmem
+)
+
+  cutlass_example_add_executable(
+    13_${FUSION_CONV_EXAMPLE}
+    ${FUSION_CONV_EXAMPLE}.cu
   )
 
-target_include_directories(
-  13_fused_two_convs
-  PRIVATE
-  .
+  add_dependencies(13_fused_two_convs 13_${FUSION_CONV_EXAMPLE})
+
+endforeach()
+
+foreach(FUSION_GEMM_EXAMPLE
+  fused_two_gemms_f16_sm75_rf
+  fused_two_gemms_f16_sm75_shmem
+  fused_two_gemms_f16_sm80_rf
+  fused_two_gemms_f16_sm80_shmem
+  fused_two_gemms_s8_sm75_rf
+  fused_two_gemms_s8_sm75_shmem
+  fused_two_gemms_s8_sm80_rf
+  fused_two_gemms_s8_sm80_shmem
+)
+  cutlass_example_add_executable(
+    13_${FUSION_GEMM_EXAMPLE}
+    ${FUSION_GEMM_EXAMPLE}.cu
   )
 
+  add_dependencies(13_fused_two_gemms 13_${FUSION_GEMM_EXAMPLE})
+
+endforeach()
+
diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md
index 76599326..4ff19e2c 100644
--- a/examples/13_two_tensor_op_fusion/README.md
+++ b/examples/13_two_tensor_op_fusion/README.md
@@ -48,33 +48,48 @@ addition to its own input activation tile. Therefore the input activation warp t
 2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
 register file, and the operation can be fully register-file-resident.
 
+On the other hand, this constraint can be relaxed if the output accumulator of the 1st GEMM/CONV
+is staged in the shared memory and then used as input for the 2nd GEMM/CONV. In this case, the
+input of each warp tile can be loaded from the shared memory so they do not need to be RF-resident,
+therefore each warp does not need to store the entire input matrix of 2nd GEMM in its RF. This is
+illustrated in the diagram below.
+
+<p align="center"><img src=/media/images/13_example_shmem_resident_fusion.png></p>
+
+
 When applying the above constraint to convolutions, it is required that the 2nd Convolution
 kernel doesn't have halos such that data used by each threadblock doesn't depend on any other
 threadblock. Typically this requires the 2nd Convolution uses 1x1 filter without any paddings.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
 
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
index badcd25e..7fa1a28b 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
@@ -1,30 +1,34 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Implicit GEMM testbed
-*/
+
 #pragma once
 
 #include <iostream>
@@ -275,7 +279,7 @@ class B2bNonFusedConv2dRun {
     cudaEventElapsedTime(&totalTime, start, stop2);
     std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
     std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
-    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+    std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
 
     tensor_D0_computed.sync_host();
     tensor_D1_computed.sync_host();
@@ -592,7 +596,7 @@ class B2bFusedConv2dRun {
     cudaDeviceSynchronize();
     float conv2dTime;
     cudaEventElapsedTime(&conv2dTime, start, stop);
-    std::cout << "time " << conv2dTime / (float)runs << " ms\n";
+    std::cout << "Fusion time " << conv2dTime / (float)runs << " ms\n";
 
     tensor_D1_computed.sync_host();
     
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
index cf34460e..bebc058f 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -264,7 +270,7 @@ struct B2bNonFusedGemmRun
     cudaEventElapsedTime(&totalTime, start, stop2);
     std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
     std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
-    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+    std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
 
     tensor_D0.sync_host();
     tensor_D1.sync_host();
@@ -507,7 +513,19 @@ struct B2bFusedGemmRun
 
     B2bGemm b2b_gemm_op;
 
-    cutlass::Status status = b2b_gemm_op.initialize(arguments);
+    cutlass::Status status = b2b_gemm_op.can_implement(arguments);
+
+    if(status != cutlass::Status::kSuccess) {
+        std::cout << "Problem sizes not supported.\n"
+                << "Requirments:\n"
+                << "    problem_size_0.M = problem_size_1.M\n"
+                << "    problem_size_0.N = problem_size_1.K\n"
+                << "    ThreadblockShape0::kN = problem_size_0.N\n"
+                << "    ThreadblockShape1::kN = problem_size_1.N" << std::endl;
+    }
+
+
+    status = b2b_gemm_op.initialize(arguments);
 
     CUTLASS_CHECK(status);
 
@@ -536,7 +554,7 @@ struct B2bFusedGemmRun
     cudaDeviceSynchronize();
     float gemmTime;
     cudaEventElapsedTime(&gemmTime, start, stop);
-    std::cout << "time " << gemmTime / (float)runs << " ms\n";
+    std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
 
     tensor_D1.sync_host();
 
diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
index 59521e49..fbdcc22b 100644
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
@@ -1,30 +1,34 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Implicit GEMM testbed
-*/
+
 #pragma once
 
 #include <iostream>
@@ -286,7 +290,7 @@ class B2bInterleavedNonFusedConv2dRun {
     cudaEventElapsedTime(&totalTime, start, stop2);
     std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n";
     std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n";
-    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+    std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
 
     tensor_D0_computed.sync_host();
     tensor_D1_computed.sync_host();
@@ -617,7 +621,7 @@ class B2bInterleavedFusedConv2dRun {
     cudaDeviceSynchronize();
     float conv2dTime;
     cudaEventElapsedTime(&conv2dTime, start, stop);
-    std::cout << "time " << conv2dTime / (float)runs << " ms\n";
+    std::cout << "Fusion time " << conv2dTime / (float)runs << " ms\n";
 
     tensor_D1_computed.sync_host();
     
diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
index f2785e68..693e252e 100644
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -275,11 +281,12 @@ struct B2bInterleavedNonFusedGemmRun
     cudaEventElapsedTime(&totalTime, start, stop2);
     std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n";
     std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n";
-    std::cout << "total time " << totalTime / (float)runs << " ms\n";
+    std::cout << "Non-fusion time " << totalTime / (float)runs << " ms\n";
 
     tensor_D0.sync_host();
     tensor_D1.sync_host();
 
+    bool passed = false;
     //
     // Verify
     //
@@ -334,7 +341,7 @@ struct B2bInterleavedNonFusedGemmRun
     CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
     CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
 
-    bool passed = cutlass::reference::host::TensorEquals(
+    passed = cutlass::reference::host::TensorEquals(
       reference_D1.host_view(), 
       tensor_D1.host_view());
 
@@ -360,7 +367,6 @@ struct B2bInterleavedNonFusedGemmRun
         << "\n\nReference =\n" << reference_D1.host_view()
         << "\nComputed =\n" << tensor_D1.host_view();
     }
-
     return passed;
   }
 };
@@ -531,7 +537,18 @@ struct B2bInterleavedFusedGemmRun
 
     B2bGemm b2b_gemm_op;
 
-    cutlass::Status status = b2b_gemm_op.initialize(arguments);
+    cutlass::Status status = b2b_gemm_op.can_implement(arguments);
+
+    if(status != cutlass::Status::kSuccess) {
+        std::cout << "Problem sizes not supported.\n"
+                << "Requirments:\n"
+                << "    problem_size_0.M = problem_size_1.M\n"
+                << "    problem_size_0.N = problem_size_1.K\n"
+                << "    ThreadblockShape0::kN = problem_size_0.N\n"
+                << "    ThreadblockShape1::kN = problem_size_1.N" << std::endl;
+    }
+
+    status = b2b_gemm_op.initialize(arguments);
 
     CUTLASS_CHECK(status);
 
@@ -560,10 +577,11 @@ struct B2bInterleavedFusedGemmRun
     cudaDeviceSynchronize();
     float gemmTime;
     cudaEventElapsedTime(&gemmTime, start, stop);
-    std::cout << "time " << gemmTime / (float)runs << " ms\n";
+    std::cout << "Fusion time " << gemmTime / (float)runs << " ms\n";
 
     tensor_D1.sync_host();
 
+    bool passed = false;
     //
     // Verify
     //
@@ -611,7 +629,7 @@ struct B2bInterleavedFusedGemmRun
     CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0);
     CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0);
 
-    bool passed = cutlass::reference::host::TensorEquals(
+    passed = cutlass::reference::host::TensorEquals(
       reference_D1.host_view(), 
       tensor_D1.host_view());
 
@@ -636,7 +654,6 @@ struct B2bInterleavedFusedGemmRun
         << "\n\nReference =\n" << reference_D1.host_view()
         << "\nComputed =\n" << tensor_D1.host_view();
     }
-
     return passed;
   }
 
diff --git a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
index 564a440a..54b58d3e 100644
--- a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -40,6 +46,7 @@
 
 #include "kernel/b2b_gemm.h"
 #include "kernel/default_b2b_gemm.h"
+#include "kernel/default_b2b_gemm_smem_accumulator.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -102,6 +109,8 @@ template <
     int Stages =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
                                  ElementC_, ElementAccumulator_>::kStages,
+    /// Stage accumulator in shared memory
+    bool SmemAccumulator = false,
     /// Access granularity of A matrix in units of elements
     int AlignmentA =
         DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
@@ -172,7 +181,8 @@ class B2bGemm {
     ThreadblockSwizzle,
     kStages,
     kSplitKSerial,
-    Operator
+    Operator,
+    SmemAccumulator
   >::B2bGemmKernel;
 
   /// Argument structure
diff --git a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
index e078aab5..7dd8fe28 100644
--- a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -36,6 +42,10 @@
 
 #include "kernel/b2b_implicit_gemm_convolution.h"
 #include "kernel/default_b2b_conv2d_fprop.h"
+#include "kernel/default_b2b_conv2d_fprop_sm75.h"
+#include "kernel/default_b2b_conv2d_fprop_sm80.h"
+#include "kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h"
+#include "kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h"
 
 namespace cutlass {
 namespace conv {
diff --git a/examples/13_two_tensor_op_fusion/fused_conv2d.cu b/examples/13_two_tensor_op_fusion/fused_conv2d.cu
deleted file mode 100644
index 05bee70c..00000000
--- a/examples/13_two_tensor_op_fusion/fused_conv2d.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h"
-#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h"
-#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
-#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"
-
-int run_sm75() {
-  bool notSupported = false;
-
-  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
-  //
-  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
-  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-    notSupported = true;
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major == 7 && props.minor >= 5)) {
-    notSupported = true;
-  }
-
-  if (notSupported) {
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
-  bool pass = 1;
- 
-  std::cout << "Running on SM75" << std::endl;
-  pass &= run_nonfused_conv2d_fprop_optimized_f16_sm75();
-  pass &= run_fused_conv2d_fprop_optimized_f16_sm75();
-  pass &= run_fused_conv2d_fprop_optimized_f16_sm75_rf_res();
-  pass &= run_nonfused_conv2d_fprop_optimized_s8_sm75();
-  pass &= run_fused_conv2d_fprop_optimized_s8_sm75();
-  pass &= run_fused_conv2d_fprop_optimized_s8_sm75_rf_res();
-
-  if(pass)
-    return 1;
-  else
-    return -1;
-
-}
-
-int run_sm80() {
-  bool notSupported = false;
-
-  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
-  //
-  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
-  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
-    notSupported = true;
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major == 8 && props.minor >= 0)) {
-    notSupported = true;
-  }
-
-  if (notSupported) {
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
-  bool pass = 1;
- 
-  std::cout << "Running on SM80" << std::endl;
-  pass &= run_nonfused_conv2d_fprop_optimized_f16_sm80();
-  pass &= run_fused_conv2d_fprop_optimized_f16_sm80();
-  pass &= run_nonfused_conv2d_fprop_optimized_s8_sm80();
-  pass &= run_fused_conv2d_fprop_optimized_s8_sm80();
-
-  if(pass)
-    return 1;
-  else
-    return -1;
-
-}
-
-
-int main() {
-
-  int result = 0;
-
-  result = run_sm80();
-
-  if(!result) { // not supported
-    result = run_sm75();
-
-    if(!result) {
-      std::cout << "This example isn't supported on current architecture" << std::endl;
-    }
-
-  }
-
-  if(result >= 0)
-    return 0;
-  else
-    return -1;
-}
-
diff --git a/examples/13_two_tensor_op_fusion/fused_gemm.cu b/examples/13_two_tensor_op_fusion/fused_gemm.cu
deleted file mode 100644
index 5f41c4ce..00000000
--- a/examples/13_two_tensor_op_fusion/fused_gemm.cu
+++ /dev/null
@@ -1,141 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
-
-#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h"
-#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h"
-#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
-#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"
-
-int run_sm75() {
-  bool notSupported = false;
-
-  // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
-  //
-  // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
-  if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
-    notSupported = true;
-
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major == 7 && props.minor >= 5)) {
-    notSupported = true;
-  }
-
-  if (notSupported) {
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
-  bool pass = true;
- 
-  std::cout << "Running on SM75" << std::endl;
-  pass &= run_nonfused_gemm_f16();
-  pass &= run_fused_gemm_f16();
-  pass &= run_nonfused_gemm_s8();
-  pass &= run_fused_gemm_s8();
-
-  if(pass)
-    return 1;
-  else
-    return -1;
-    
-
-}
-
-int run_sm80() {
-  bool notSupported = false;
-
-  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
-  //
-  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
-  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
-    notSupported = true;
-
-  }
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major == 8 && props.minor >= 0)) {
-    notSupported = true;
-  }
-
-  if (notSupported) {
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
-  bool pass = true;
- 
-  std::cout << "Running on SM80" << std::endl;
-  pass &= run_nonfused_gemm_f16_sm80();
-  pass &= run_fused_gemm_f16_sm80();
-  pass &= run_nonfused_gemm_s8_sm80();
-  pass &= run_fused_gemm_s8_sm80();
-
-  if(pass)
-    return 1;
-  else
-    return -1;
-
-}
-
-
-int main() {
-
-  int result = 0;
-
-  result = run_sm80();
-
-  if(!result) { // not supported
-    result = run_sm75();
-
-    if(!result) {
-      std::cout << "This example isn't supported on current architecture" << std::endl;
-    }
-
-  }
-
-  if(result >= 0)
-    return 0;
-  else
-    return -1;
-}
-
-
-
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
similarity index 60%
rename from examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h
rename to examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
index 42edd6aa..e5b0dd1f 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
@@ -1,30 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Tests for device-wide GEMM interface
-*/
 
 #include <iostream>
 
@@ -35,24 +38,25 @@
 
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_conv2d_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
-    {128, 56, 56, 64},    // input size (NHWC)
+    {32, 56, 56, 64},    // input size (NHWC)
     {64, 3, 3, 64},   // filter size (KRSC)
     {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 64}     // output size (NPQK)
   );
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
-    {128, 56, 56, 64},    // input size (NHWC)
-    {256, 1, 1, 64},   // filter size (KRSC)
+    {32, 56, 56, 64},    // input size (NHWC)
+    {128, 1, 1, 64},   // filter size (KRSC)
     {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 256}     // output size (NPQK)
+    {32, 56, 56, 128}     // output size (NPQK)
   );
 
 bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
@@ -64,13 +68,13 @@ bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
   using ElementCompute     = cutlass::half_t;
 
   ElementCompute alpha0 = ElementCompute(1);
-  ElementCompute beta0 = ElementCompute(1); //use beta for bias
+  ElementCompute beta0 = ElementCompute(0);
   ElementCompute alpha1 = ElementCompute(1);
   ElementCompute beta1 = ElementCompute(1); //use beta for bias
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
   using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
   using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
 
@@ -89,7 +93,7 @@ bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
       128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute,
-      cutlass::epilogue::thread::ScaleType::NoBetaScaling
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2,
@@ -138,83 +142,6 @@ bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
   return pass;
 }
 
-bool run_fused_conv2d_fprop_optimized_f16_sm75() {
-
-  using ElementA           = cutlass::half_t; 
-  using ElementB           = cutlass::half_t; 
-  using ElementC           = cutlass::half_t; 
-  using ElementAccumulator = cutlass::half_t; 
-  using ElementCompute     = cutlass::half_t; 
-
-  ElementCompute alpha0 = ElementCompute(1);
-  ElementCompute beta0 = ElementCompute(0); 
-  ElementCompute alpha1 = ElementCompute(1);
-  ElementCompute beta1 = ElementCompute(1); //use beta for bias
-
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
-  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
-  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
-
-  using EpilogueOutputOp0 = 
-    cutlass::epilogue::thread::LinearCombinationRelu<
-      ElementC,
-      InstructionShape::kM * InstructionShape::kN / 32,
-      ElementAccumulator,
-      ElementCompute,
-      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
-    >;
-
-  using EpilogueOutputOp1 = 
-    cutlass::epilogue::thread::LinearCombinationRelu<
-      ElementC,
-      128 / cutlass::sizeof_bits<ElementC>::value,
-      ElementAccumulator,
-      ElementCompute,
-      cutlass::epilogue::thread::ScaleType::NoBetaScaling
-    >;
-
-
-  const bool SmemAccumulator = true;
-
-  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
-    ElementA, cutlass::layout::TensorNHWC,
-    ElementB, cutlass::layout::TensorNHWC,
-    ElementC, cutlass::layout::TensorNHWC,
-    ElementAccumulator,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm75,
-    ThreadblockShape0,
-    ThreadblockShape1,
-    WarpShape0,
-    WarpShape1,
-    InstructionShape,
-    EpilogueOutputOp0,
-    EpilogueOutputOp1,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
-    2,
-    cutlass::arch::OpMultiplyAdd,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
-    SmemAccumulator
-  >::Kernel;
-  
-  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
-
-  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
-
-  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with shared memory staging...\n";
-  bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
-      alpha0, beta0, alpha1, beta1);
-
-  if(pass)
-    std::cout << "Pass\n";
-  else
-    std::cout << "Fail\n";
-
-  return pass;
-}
-
 bool run_fused_conv2d_fprop_optimized_f16_sm75_rf_res() {
 
   using ElementA           = cutlass::half_t; 
@@ -230,8 +157,8 @@ bool run_fused_conv2d_fprop_optimized_f16_sm75_rf_res() {
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
   using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 256, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
 
   using EpilogueOutputOp0 = 
@@ -292,5 +219,15 @@ bool run_fused_conv2d_fprop_optimized_f16_sm75_rf_res() {
   return pass;
 }
 
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_f16_sm75,
+    &run_fused_conv2d_fprop_optimized_f16_sm75_rf_res
+  };
+
+  return testRun(75, funcs, "conv f16 RF residency");
+
+}
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
new file mode 100644
index 00000000..e549ccc3
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
@@ -0,0 +1,233 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_conv2d_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {256, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 256}     // output size (NPQK)
+  );
+
+bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_conv2d_fprop_optimized_f16_sm75_shmem() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0); 
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+
+  const bool SmemAccumulator = true;
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    SmemAccumulator
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with shared memory staging...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+   &run_nonfused_conv2d_fprop_optimized_f16_sm75,
+   &run_fused_conv2d_fprop_optimized_f16_sm75_shmem
+  };
+
+  return testRun(75, funcs, "conv f16 shmem staging");
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
similarity index 71%
rename from examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h
rename to examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
index cdad8512..13a2a9d9 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
@@ -1,30 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Tests for device-wide GEMM interface
-*/
 
 #include <iostream>
 
@@ -35,24 +38,25 @@
 
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_conv2d_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
-    {128, 56, 56, 64},    // input size (NHWC)
+    {32, 56, 56, 64},    // input size (NHWC)
     {64, 3, 3, 64},   // filter size (KRSC)
     {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 64}     // output size (NPQK)
   );
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
-    {128, 56, 56, 64},    // input size (NHWC)
-    {64, 1, 1, 64},   // filter size (KRSC)
+    {32, 56, 56, 64},    // input size (NHWC)
+    {128, 1, 1, 64},   // filter size (KRSC)
     {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 128}     // output size (NPQK)
   );
 
 
@@ -69,10 +73,10 @@ bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
   ElementCompute alpha1 = ElementCompute(1);
   ElementCompute beta1 = ElementCompute(0);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
 
   using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
@@ -138,8 +142,7 @@ bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
   return pass;
 }
 
-bool run_fused_conv2d_fprop_optimized_f16_sm80() {
-
+bool run_fused_conv2d_fprop_optimized_f16_sm80_rf_res() {
   using ElementA           = cutlass::half_t; 
   using ElementB           = cutlass::half_t; 
   using ElementC           = cutlass::half_t; 
@@ -151,10 +154,10 @@ bool run_fused_conv2d_fprop_optimized_f16_sm80() {
   ElementCompute alpha1 = ElementCompute(1);
   ElementCompute beta1 = ElementCompute(0);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
 
   using EpilogueOutputOp0 = 
@@ -198,7 +201,7 @@ bool run_fused_conv2d_fprop_optimized_f16_sm80() {
 
   B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
 
-  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with RF Residency...\n";
   bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
       alpha0, beta0, alpha1, beta1);
 
@@ -208,7 +211,20 @@ bool run_fused_conv2d_fprop_optimized_f16_sm80() {
     std::cout << "Fail\n";
 
   return pass;
+    return true;
+}
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_f16_sm80,
+    &run_fused_conv2d_fprop_optimized_f16_sm80_rf_res
+  };
+
+  return testRun(80, funcs, "conv f16 RF residency");
+
 }
 
+
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
new file mode 100644
index 00000000..deacaa8e
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
@@ -0,0 +1,233 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_conv2d_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {256, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 256}     // output size (NPQK)
+  );
+
+
+bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
+
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute     = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_conv2d_fprop_optimized_f16_sm80_shmem() {
+
+  using ElementA           = cutlass::half_t; 
+  using ElementB           = cutlass::half_t; 
+  using ElementC           = cutlass::half_t; 
+  using ElementAccumulator = cutlass::half_t; 
+  using ElementCompute     = cutlass::half_t; 
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  const bool SmemAccumulator = true;
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    SmemAccumulator
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bFusedConv2dRun<B2bConv2dFprop> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops with shared memory staging...\n";
+  bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_f16_sm80,
+    &run_fused_conv2d_fprop_optimized_f16_sm80_shmem
+  };
+
+  return testRun(80, funcs, "conv f16 shmem staging");
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
similarity index 61%
rename from examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h
rename to examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
index eb0ba622..35f3f094 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
@@ -1,30 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Tests for device-wide GEMM interface
-*/
 
 #include <iostream>
 
@@ -35,24 +38,25 @@
 
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_interleaved_conv2d_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
-    {128, 56, 56, 64},    // input size (NHWC)
+    {32, 56, 56, 64},    // input size (NHWC)
     {64, 3, 3, 64},   // filter size (KRSC)
     {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 64}     // output size (NPQK)
   );
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
-    {128, 56, 56, 64},    // input size (NHWC)
-    {256, 1, 1, 64},   // filter size (KRSC)
+    {32, 56, 56, 64},    // input size (NHWC)
+    {128, 1, 1, 64},   // filter size (KRSC)
     {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 256}     // output size (NPQK)
+    {32, 56, 56, 128}     // output size (NPQK)
   );
 
 bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
@@ -137,7 +141,8 @@ bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
   return pass;
 }
 
-bool run_fused_conv2d_fprop_optimized_s8_sm75() {
+
+bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {
 
   using ElementA           = int8_t;
   using ElementB           = int8_t;
@@ -151,9 +156,9 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75() {
   ElementCompute beta1 = ElementCompute(1);
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
-  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
 
   using EpilogueOutputOp0 = 
@@ -174,7 +179,7 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75() {
     >;
 
 
-  const bool SmemAccumulator = true;
+  const bool SmemAccumulator = false;
 
   using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
     ElementA, cutlass::layout::TensorNCxHWx<32>,
@@ -201,7 +206,7 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75() {
 
   B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
 
-  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with shared memory staging...\n";
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with RF residency...\n";
   bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
       alpha0, beta0, alpha1, beta1);
 
@@ -213,81 +218,18 @@ bool run_fused_conv2d_fprop_optimized_s8_sm75() {
   return pass;
 }
 
-bool run_fused_conv2d_fprop_optimized_s8_sm75_rf_res() {
-
-  using ElementA           = int8_t;
-  using ElementB           = int8_t;
-  using ElementC           = int8_t;
-  using ElementAccumulator = int32_t;
-  using ElementCompute = float;
+int main() {
 
-  ElementCompute alpha0 = ElementCompute(1);
-  ElementCompute beta0 = ElementCompute(0);
-  ElementCompute alpha1 = ElementCompute(1);
-  ElementCompute beta1 = ElementCompute(1);
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_s8_sm75,
+    &run_fused_conv2d_fprop_optimized_s8_sm75_rf_res
+  };
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 256, 32>;
-  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
-
-  using EpilogueOutputOp0 = 
-    cutlass::epilogue::thread::LinearCombinationRelu<
-      ElementC,
-      InstructionShape::kM * InstructionShape::kN / 32,
-      ElementAccumulator,
-      ElementCompute,
-      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
-    >;
-
-  using EpilogueOutputOp1 = 
-    cutlass::epilogue::thread::LinearCombinationRelu<
-      ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
-      ElementAccumulator,
-      ElementCompute
-    >;
+  return testRun(75, funcs, "conv int8 RF residency");
 
+}
 
-  const bool SmemAccumulator = false;
-
-  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
-    ElementA, cutlass::layout::TensorNCxHWx<32>,
-    ElementB, cutlass::layout::TensorCxRSKx<32>,
-    ElementC, cutlass::layout::TensorNCxHWx<32>,
-    ElementAccumulator,
-    cutlass::arch::OpClassTensorOp,
-    cutlass::arch::Sm75,
-    ThreadblockShape0,
-    ThreadblockShape1,
-    WarpShape0,
-    WarpShape1,
-    InstructionShape,
-    EpilogueOutputOp0,
-    EpilogueOutputOp1,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
-    2,
-    cutlass::arch::OpMultiplyAddSaturate,
-    cutlass::conv::IteratorAlgorithm::kOptimized,
-    SmemAccumulator
-  >::Kernel;
-  
-  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
-
-  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
-
-  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with RF residency...\n";
-  bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
-      alpha0, beta0, alpha1, beta1);
-
-  if(pass)
-    std::cout << "Pass\n";
-  else
-    std::cout << "Fail\n";
 
-  return pass;
-}
 
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
new file mode 100644
index 00000000..e7babbe4
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
@@ -0,0 +1,235 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_interleaved_conv2d_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {256, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 256}     // output size (NPQK)
+  );
+
+bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_conv2d_fprop_optimized_s8_sm75_shmem() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+  const bool SmemAccumulator = true;
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    SmemAccumulator
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with shared memory staging...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_s8_sm75,
+    &run_fused_conv2d_fprop_optimized_s8_sm75_shmem
+  };
+
+  return testRun(75, funcs, "conv int8 shmem staging");
+
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
similarity index 74%
rename from examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h
rename to examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
index 740434f8..ac193b99 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
@@ -1,30 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-/*! \file
-    \brief Tests for device-wide GEMM interface
-*/
 
 #include <iostream>
 
@@ -35,24 +38,25 @@
 
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_interleaved_conv2d_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
-    {128, 56, 56, 64},    // input size (NHWC)
+    {32, 56, 56, 64},    // input size (NHWC)
     {64, 3, 3, 64},   // filter size (KRSC)
     {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 64}     // output size (NPQK)
   );
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
-    {128, 56, 56, 64},    // input size (NHWC)
-    {64, 1, 1, 64},   // filter size (KRSC)
+    {32, 56, 56, 64},    // input size (NHWC)
+    {128, 1, 1, 64},   // filter size (KRSC)
     {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
     {1, 1},           // stride (stride_h, stride_w)
     {1, 1},           // dilation (dilation_h, dilation_w)
-    {128, 56, 56, 64}     // output size (NPQK)
+    {32, 56, 56, 128}     // output size (NPQK)
   );
 
 bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
@@ -69,8 +73,8 @@ bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
   ElementCompute beta1 = ElementCompute(0);
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
   using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
 
@@ -137,7 +141,8 @@ bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
   return pass;
 }
 
-bool run_fused_conv2d_fprop_optimized_s8_sm80() {
+
+bool run_fused_conv2d_fprop_optimized_s8_sm80_rf_res() {
 
   using ElementA           = int8_t;
   using ElementB           = int8_t;
@@ -152,8 +157,8 @@ bool run_fused_conv2d_fprop_optimized_s8_sm80() {
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
   using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
 
   using EpilogueOutputOp0 = 
@@ -199,7 +204,7 @@ bool run_fused_conv2d_fprop_optimized_s8_sm80() {
 
   B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
 
-  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with RF residency...\n";
   bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
       alpha0, beta0, alpha1, beta1);
 
@@ -211,6 +216,18 @@ bool run_fused_conv2d_fprop_optimized_s8_sm80() {
   return pass;
 }
 
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_s8_sm80,
+    &run_fused_conv2d_fprop_optimized_s8_sm80_rf_res
+  };
+
+  return testRun(80, funcs, "conv int8 RF residency");
+
+
+}
+
 
 ////////////////////////////////////////////////////////////////////////////////
 
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
new file mode 100644
index 00000000..07e87369
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "device/b2b_implicit_gemm_convolution.h"
+#include "b2b_interleaved_conv2d_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {64, 3, 3, 64},   // filter size (KRSC)
+    {1, 1, 1, 1},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 64}     // output size (NPQK)
+  );
+cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
+    {32, 56, 56, 64},    // input size (NHWC)
+    {256, 1, 1, 64},   // filter size (KRSC)
+    {0, 0, 0, 0},     // padding (pad_h, _, pad_w, _)
+    {1, 1},           // stride (stride_h, stride_w)
+    {1, 1},           // dilation (dilation_h, dilation_w)
+    {32, 56, 56, 256}     // output size (NPQK)
+  );
+
+bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel0>;
+
+  using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel1>;
+
+  B2bInterleavedNonFusedConv2dRun<Conv2dFprop0, Conv2dFprop1, 32> nonFusedConv2d;
+
+  std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n";
+  bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_conv2d_fprop_optimized_s8_sm80_shmem() {
+
+  using ElementA           = int8_t;
+  using ElementB           = int8_t;
+  using ElementC           = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(1);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(1);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      8 * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementC,
+      64 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+  const bool SmemAccumulator = true;
+
+  using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+    ElementA, cutlass::layout::TensorNCxHWx<32>,
+    ElementB, cutlass::layout::TensorCxRSKx<32>,
+    ElementC, cutlass::layout::TensorNCxHWx<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    SmemAccumulator
+  >::Kernel;
+  
+  using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution<B2bConv2dFpropKernel>;
+
+  B2bInterleavedFusedConv2dRun<B2bConv2dFprop, 32> fusedConv2d;
+
+  std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops with shared memory staging...\n";
+  bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial,
+      alpha0, beta0, alpha1, beta1);
+
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_conv2d_fprop_optimized_s8_sm80,
+    &run_fused_conv2d_fprop_optimized_s8_sm80_shmem
+  };
+
+  return testRun(80, funcs, "conv int8 shmem staging");
+
+
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
similarity index 67%
rename from examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
rename to examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
index 3ec45231..9ee5bc0a 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
@@ -1,29 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-#pragma once
-
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -38,11 +42,12 @@
 
 #include "device/b2b_gemm.h"
 #include "b2b_gemm_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
-cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
-cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);
+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*640, 128, 64);
 
 bool run_nonfused_gemm_f16() {
 
@@ -55,10 +60,10 @@ bool run_nonfused_gemm_f16() {
   ElementCompute alpha1 = ElementCompute(2);
   ElementCompute beta1 = ElementCompute(1);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
-  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
 
   using Gemm0 = cutlass::gemm::device::Gemm<
@@ -119,7 +124,8 @@ bool run_nonfused_gemm_f16() {
   return pass;
 }
 
-bool run_fused_gemm_f16() {
+
+bool run_fused_gemm_f16_rf_res() {
 
   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;
@@ -130,9 +136,9 @@ bool run_fused_gemm_f16() {
   ElementCompute alpha1 = ElementCompute(2);
   ElementCompute beta1 = ElementCompute(1);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
   using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
 
@@ -153,8 +159,6 @@ bool run_fused_gemm_f16() {
       ElementCompute
     >;
 
-
-
   using B2bGemm = cutlass::gemm::device::B2bGemm<
     cutlass::half_t,
     cutlass::layout::RowMajor,
@@ -178,7 +182,7 @@ bool run_fused_gemm_f16() {
 
   B2bFusedGemmRun<B2bGemm> fusedGemm;
 
-  std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
+  std::cout << "Running Fused back-to-back FP16 TN GEMMs with RF Residency...\n";
   bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
   if(passed)
     std::cout << "Pass\n";
@@ -187,4 +191,17 @@ bool run_fused_gemm_f16() {
 
   return passed;
 }
-////////////////////////////////////////////////////////////////////////////////
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_f16,
+    &run_fused_gemm_f16_rf_res
+  };
+
+  return testRun(75, funcs, "gemm f16 RF residency");
+
+
+}
+
+///////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
new file mode 100644
index 00000000..0c9aa0e2
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
@@ -0,0 +1,211 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_gemm_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*640, 256, 64);
+
+bool run_nonfused_gemm_f16() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Gemm0 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2
+  >;
+  using Gemm1 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2
+  >;
+
+  B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
+
+  std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
+  bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_gemm_f16_shmem() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+  const bool SmemAccumulator = true;
+
+  using B2bGemm = cutlass::gemm::device::B2bGemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    SmemAccumulator
+  >;
+
+  B2bFusedGemmRun<B2bGemm> fusedGemm;
+
+  std::cout << "Running Fused back-to-back FP16 TN GEMMs with shared memory staging...\n";
+  bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(passed)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return passed;
+}
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_f16,
+    &run_fused_gemm_f16_shmem
+  };
+
+  return testRun(75, funcs, "gemm f16 shmem staging");
+
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
similarity index 74%
rename from examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h
rename to examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
index 93cb6177..6610737a 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
@@ -1,29 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-#pragma once
-
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -38,11 +42,12 @@
 
 #include "device/b2b_gemm.h"
 #include "b2b_gemm_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
-cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
-cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*640, 128, 64);
 
 bool run_nonfused_gemm_f16_sm80() {
 
@@ -119,7 +124,7 @@ bool run_nonfused_gemm_f16_sm80() {
   return pass;
 }
 
-bool run_fused_gemm_f16_sm80() {
+bool run_fused_gemm_f16_sm80_rf_res() {
 
   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;
@@ -178,7 +183,7 @@ bool run_fused_gemm_f16_sm80() {
 
   B2bFusedGemmRun<B2bGemm> fusedGemm;
 
-  std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n";
+  std::cout << "Running Fused back-to-back FP16 TN GEMMs with RF residency...\n";
   bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
   if(passed)
     std::cout << "Pass\n";
@@ -188,4 +193,20 @@ bool run_fused_gemm_f16_sm80() {
   return passed;
 
 }
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_f16_sm80,
+    &run_fused_gemm_f16_sm80_rf_res
+  };
+
+  return testRun(80, funcs, "gemm f16 RF residency");
+
+
+}
+
+
+
+
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
new file mode 100644
index 00000000..48c31ae0
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
@@ -0,0 +1,214 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_gemm_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*640, 256, 64);
+
+bool run_nonfused_gemm_f16_sm80() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using Gemm0 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3
+  >;
+  using Gemm1 = cutlass::gemm::device::Gemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3
+  >;
+
+  B2bNonFusedGemmRun<Gemm0, Gemm1> nonFusedGemm;
+
+  std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n";
+  bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_gemm_f16_sm80_shmem() {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = cutlass::half_t;
+  using ElementCompute = cutlass::half_t;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+
+
+  const bool SmemAccumulator = true;
+
+  using B2bGemm = cutlass::gemm::device::B2bGemm<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    3,
+    SmemAccumulator
+  >;
+
+  B2bFusedGemmRun<B2bGemm> fusedGemm;
+
+  std::cout << "Running Fused back-to-back FP16 TN GEMMs with shared memory staging...\n";
+  bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(passed)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return passed;
+
+}
+
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_f16_sm80,
+    &run_fused_gemm_f16_sm80_shmem
+  };
+
+  return testRun(80, funcs, "gemm f16 shmem staging");
+
+
+}
+
+
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
similarity index 70%
rename from examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
rename to examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
index fd5d841b..2fff4d84 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
@@ -1,29 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-#pragma once
-
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -38,11 +42,12 @@
 
 #include "device/b2b_gemm.h"
 #include "b2b_interleaved_gemm_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
-cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
-cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);
+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*640, 128, 64);
 
 bool run_nonfused_gemm_s8() {
 
@@ -57,8 +62,8 @@ bool run_nonfused_gemm_s8() {
 
   using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
   using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
   using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
 
   using Gemm0 = cutlass::gemm::device::Gemm<
@@ -119,7 +124,8 @@ bool run_nonfused_gemm_s8() {
   return pass;
 }
 
-bool run_fused_gemm_s8() {
+
+bool run_fused_gemm_s8_rf_res() {
 
   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;
@@ -130,10 +136,10 @@ bool run_fused_gemm_s8() {
   ElementCompute alpha1 = ElementCompute(2);
   ElementCompute beta1 = ElementCompute(1);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>;
   using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
 
   using EpilogueOutputOp0 = 
@@ -176,7 +182,7 @@ bool run_fused_gemm_s8() {
 
   B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
 
-  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
+  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with RF Residency...\n";
   bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
   if(passed)
     std::cout << "Pass\n";
@@ -186,4 +192,18 @@ bool run_fused_gemm_s8() {
   return passed;
 
 }
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_s8,
+    &run_fused_gemm_s8_rf_res
+  };
+
+  return testRun(75, funcs, "gemm f16 RF residency");
+
+
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
new file mode 100644
index 00000000..952688d4
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
@@ -0,0 +1,211 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_interleaved_gemm_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*640, 256, 64);
+
+bool run_nonfused_gemm_s8() {
+
+  using ElementOutput = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using Gemm0 = cutlass::gemm::device::Gemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2
+  >;
+  using Gemm1 = cutlass::gemm::device::Gemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2
+  >;
+
+  B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
+
+  std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
+  bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_gemm_s8_shmem() {
+
+  using ElementOutput = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(1);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 32>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 32>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      InstructionShape::kM * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >;
+  
+  const bool SmemAccumulator = true;
+
+  using B2bGemm = cutlass::gemm::device::B2bGemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
+    2,
+    SmemAccumulator
+  >;
+
+  B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
+
+  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with shared memory staging...\n";
+  bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(passed)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return passed;
+
+}
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_s8,
+    &run_fused_gemm_s8_shmem
+  };
+
+  return testRun(75, funcs, "gemm s8 shmem staing");
+
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
similarity index 72%
rename from examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h
rename to examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
index 270bea5f..a4405504 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
@@ -1,29 +1,33 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-#pragma once
-
 #include <iostream>
 
 #include "cutlass/cutlass.h"
@@ -38,11 +42,12 @@
 
 #include "device/b2b_gemm.h"
 #include "b2b_interleaved_gemm_run.h"
+#include "test_run.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
-cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
-cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);
+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*640, 128, 64);
 
 bool run_nonfused_gemm_s8_sm80() {
 
@@ -55,10 +60,10 @@ bool run_nonfused_gemm_s8_sm80() {
   ElementCompute alpha1 = ElementCompute(2);
   ElementCompute beta1 = ElementCompute(0);
 
-  using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>;
-  using WarpShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
-  using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>;
   using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
 
   using Gemm0 = cutlass::gemm::device::Gemm<
@@ -128,7 +133,8 @@ bool run_nonfused_gemm_s8_sm80() {
   return pass;
 }
 
-bool run_fused_gemm_s8_sm80() {
+
+bool run_fused_gemm_s8_sm80_rf_res() {
 
   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;
@@ -163,6 +169,8 @@ bool run_fused_gemm_s8_sm80() {
       cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;
 
+  const bool SmemAccumulator = false;
+
   using B2bGemm = cutlass::gemm::device::B2bGemm<
     int8_t,
     cutlass::layout::ColumnMajorInterleaved<32>,
@@ -182,6 +190,7 @@ bool run_fused_gemm_s8_sm80() {
     EpilogueOutputOp1,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     3,
+    SmemAccumulator,
     16,
     16,
     false,
@@ -190,7 +199,7 @@ bool run_fused_gemm_s8_sm80() {
 
   B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
 
-  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n";
+  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with RF residency...\n";
   bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
   if(passed)
     std::cout << "Pass\n";
@@ -199,4 +208,19 @@ bool run_fused_gemm_s8_sm80() {
 
   return passed;
 }
+
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_s8_sm80,
+    &run_fused_gemm_s8_sm80_rf_res
+  };
+
+  return testRun(80, funcs, "gemm int8 RF residency");
+
+
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
new file mode 100644
index 00000000..c88d9df1
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
@@ -0,0 +1,225 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "device/b2b_gemm.h"
+#include "b2b_interleaved_gemm_run.h"
+#include "test_run.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*640, 64, 576);
+cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*640, 256, 64);
+
+bool run_nonfused_gemm_s8_sm80() {
+
+  using ElementOutput = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using Gemm0 = cutlass::gemm::device::Gemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    WarpShape0,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    16,
+    16,
+    false,
+    cutlass::arch::OpMultiplyAddSaturate
+  >;
+  using Gemm1 = cutlass::gemm::device::Gemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape1,
+    WarpShape1,
+    InstructionShape,
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    16,
+    16,
+    false,
+    cutlass::arch::OpMultiplyAddSaturate
+  >;
+
+  B2bInterleavedNonFusedGemmRun<Gemm0, Gemm1, 32> nonFusedGemm;
+
+  std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n";
+  bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(pass)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return pass;
+}
+
+bool run_fused_gemm_s8_sm80_shmem() {
+
+  using ElementOutput = int8_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  ElementCompute alpha0 = ElementCompute(2);
+  ElementCompute beta0 = ElementCompute(0);
+  ElementCompute alpha1 = ElementCompute(2);
+  ElementCompute beta1 = ElementCompute(0);
+
+  using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>;
+  using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 256, 64>;
+  using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 64>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>;
+
+  using EpilogueOutputOp0 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      8 * InstructionShape::kN / 32,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+
+  using EpilogueOutputOp1 = 
+    cutlass::epilogue::thread::LinearCombinationRelu<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >;
+  
+  const bool SmemAccumulator = true;
+
+  using B2bGemm = cutlass::gemm::device::B2bGemm<
+    int8_t,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    int8_t,
+    cutlass::layout::RowMajorInterleaved<32>,
+    ElementOutput,
+    cutlass::layout::ColumnMajorInterleaved<32>,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    EpilogueOutputOp0,
+    EpilogueOutputOp1,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    SmemAccumulator,
+    16,
+    16,
+    false,
+    cutlass::arch::OpMultiplyAddSaturate
+  >;
+
+  B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
+
+  std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs with shared memory staging...\n";
+  bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1);
+  if(passed)
+    std::cout << "Pass\n";
+  else
+    std::cout << "Fail\n";
+
+  return passed;
+}
+
+
+int main() {
+
+  std::vector<bool (*)()>funcs = {
+    &run_nonfused_gemm_s8_sm80,
+    &run_fused_gemm_s8_sm80_shmem
+  };
+
+  return testRun(80, funcs, "gemm int8 shmem staging");
+
+
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
index 7bc22bb1..53880b21 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -202,6 +208,19 @@ struct B2bGemm {
       return Status::kErrorMisalignedOperand;
     }
 
+    // Determine if fusion sizes are valid
+    if(problem_size_0.m() != problem_size_1.m())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() != problem_size_1.k())
+      return Status::kErrorInvalidProblem;
+
+    if(problem_size_0.n() > B2bMma::Shape0::kN)
+      return Status::kErrorInvalidProblem;
+    
+    if(problem_size_1.n() > B2bMma::Shape1::kN)
+      return Status::kErrorInvalidProblem;
+
     return Status::kSuccess;
   }
 
diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
index b9246c20..8ebc10c0 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
index 1ae49ae1..da7c9aef 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -48,7 +54,6 @@
 #include "kernel/b2b_implicit_gemm_convolution.h"
 #include "threadblock/b2b_implicit_gemm_pipelined.h"
 #include "threadblock/b2b_implicit_gemm_multistage.h"
-#include "threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -82,2098 +87,6 @@ template <
   bool SmemAccumulator = false
 > struct DefaultB2bConv2dFprop;
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//                         OpClassTensorOp convolutions 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage 
-/// pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-      ElementA, LayoutA,
-      ThreadMapA0
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-      ElementB, LayoutB,
-      ThreadMapB0
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::ColumnMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-      ElementB, LayoutB,
-      ThreadMapB1
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmMultistage<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    arch::CacheOperation::Always,
-    IteratorB0,
-    SmemIteratorB0,
-    arch::CacheOperation::Global,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    arch::CacheOperation::Global,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-      ElementA, layout::TensorNCxHWx<InterleavedK>,
-      ThreadMapA0
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-      ElementB, layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB0
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-      ElementB, layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB1
-    >;
- 
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmMultistage<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    arch::CacheOperation::Always,
-    IteratorB0,
-    SmemIteratorB0,
-    arch::CacheOperation::Global,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    arch::CacheOperation::Global,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, LayoutA,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, LayoutB,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::ColumnMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, LayoutB,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelined<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm
-/// and 2 stage pipeline.
-/// Accumulator will be staged in shared memory.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  true
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, LayoutA,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, LayoutB,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, LayoutB,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Use fragment iterator for the accumulator
-  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          WarpShape0, InstructionShape,
-          ElementAccumulator,
-          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
-          SmemAccumulatorLayout
-        >;
-
-  // Store Accumulator tiles to Shared Memory
-  using SmemIteratorD0 = 
-      cutlass::epilogue::warp::TileIteratorTensorOp<
-          WarpShape0,
-          InstructionShape,
-          ElementC,
-          SmemAccumulatorLayout
-        >;
-
-  static int const kThreadCount = 32;
-  // load warp tile from Shared Memory accumulator
-  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
-    ElementA, SmemAccumulatorLayout,
-    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
- 
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorAccumulator,
-    SmemIteratorD0,
-    ThreadblockShape1,
-    WarpIteratorA1,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  false
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelined<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-/// Accumulator will be staged in shared memory.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kAnalytic,
-  true
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4; //For interleaved layout
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
- // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Use fragment iterator for the accumulator
-  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
-  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          WarpShape0, InstructionShape,
-          ElementAccumulator,
-          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
-          SmemAccumulatorLayout
-        >;
-
-
-  // Store Accumulator tiles to Shared Memory
-  using SmemIteratorD0 = 
-      cutlass::epilogue::warp::TileIteratorTensorOp<
-          WarpShape0,
-          InstructionShape,
-          ElementC,
-          SmemAccumulatorLayout
-        >;
-
-  static int const kThreadCount = 32;
-  // load warp tile from Shared Memory accumulator
-  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
-    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
-    ElementA, SmemAccumulatorLayout,
-    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
- 
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorAccumulator,
-    SmemIteratorD0,
-    ThreadblockShape1,
-    WarpIteratorA1,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 
-/// multistage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      Stages, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-      ElementA, LayoutA,
-      ThreadMapA0
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-      ElementB, LayoutB,
-      ThreadMapB0
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::ColumnMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-      ElementB, LayoutB,
-      ThreadMapB1
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmMultistage<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    arch::CacheOperation::Always,
-    IteratorB0,
-    SmemIteratorB0,
-    arch::CacheOperation::Global,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    arch::CacheOperation::Global,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and 
-// multistage pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  int Stages,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  Stages,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>,
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      Stages, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-      ElementA, layout::TensorNCxHWx<InterleavedK>,
-      ThreadMapA0
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-      ElementB, layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB0
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-      ElementB, layout::TensorCxRSKx<InterleavedK>,
-      ThreadMapB1
-    >;
- 
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmMultistage<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    arch::CacheOperation::Always,
-    IteratorB0,
-    SmemIteratorB0,
-    arch::CacheOperation::Global,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    arch::CacheOperation::Global,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1,
-    Stages 
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm
-/// and 2 stage pipeline.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, LayoutA,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, LayoutB,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::ColumnMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, LayoutB,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelined<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm
-/// and 2 stage pipeline.
-/// Accumulator will be staged in shared memory.
-template <
-  typename ElementA,
-  typename LayoutA,
-  typename ElementB,
-  typename LayoutB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  LayoutA,
-  ElementB,
-  LayoutB,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  true
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
-      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
-      2, MathOperatorTag>;
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, LayoutA,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, LayoutB,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 2;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, LayoutB,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Use fragment iterator for the accumulator
-  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          WarpShape0, InstructionShape,
-          ElementAccumulator,
-          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
-          SmemAccumulatorLayout
-        >;
-
-  // Store Accumulator tiles to Shared Memory
-  using SmemIteratorD0 = 
-      cutlass::epilogue::warp::TileIteratorTensorOp<
-          WarpShape0,
-          InstructionShape,
-          ElementC,
-          SmemAccumulatorLayout
-        >;
-
-  static int const kThreadCount = 32;
-  // load warp tile from Shared Memory accumulator
-  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
-    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
-    ElementA, SmemAccumulatorLayout,
-    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
- 
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorAccumulator,
-    SmemIteratorD0,
-    ThreadblockShape1,
-    WarpIteratorA1,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename detail::DefaultConvEpilogue<
-    ArchTag,
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  // Use fragment iterator for A operand
-  using AccumulatorLayout = cutlass::layout::RowMajor;
-  using FragmentIteratorA1 = 
-      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
-          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
-          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
-          MmaCore1::Shape::kK, //kBlocksColumn
-          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4;
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  // Warp-level iterators to load scale and bias vectors
-  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
-      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
-      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
-
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelined<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    ThreadblockShape1,
-    FragmentIteratorA1,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorA1ScaleBias,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage 
-/// pipeline with interleaved layout.
-/// Accumulator will be staged in shared memory.
-template <
-  typename ElementA,
-  typename ElementB,
-  typename ElementC,
-  typename LayoutC,
-  typename ElementAccumulator,
-  typename ArchTag,
-  typename ThreadblockShape0,
-  typename ThreadblockShape1,
-  typename WarpShape0,
-  typename WarpShape1,
-  typename InstructionShape,
-  typename EpilogueOutputOp0,
-  typename EpilogueOutputOp1,
-  typename ThreadblockSwizzle,
-  typename MathOperatorTag,
-  int InterleavedK
->
-struct DefaultB2bConv2dFprop <
-  ElementA,
-  layout::TensorNCxHWx<InterleavedK>,
-  ElementB,
-  layout::TensorCxRSKx<InterleavedK>,
-  ElementC,
-  LayoutC,
-  ElementAccumulator,
-  arch::OpClassTensorOp,
-  ArchTag,
-  ThreadblockShape0,
-  ThreadblockShape1,
-  WarpShape0,
-  WarpShape1,
-  InstructionShape,
-  EpilogueOutputOp0,
-  EpilogueOutputOp1,
-  ThreadblockSwizzle,
-  2,
-  MathOperatorTag,
-  IteratorAlgorithm::kOptimized,
-  true
-> {
-
-  // Define the core components from GEMM
-  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
-      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
-      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
-      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
-      2, MathOperatorTag, true>;
-
-  // Define iterators over tiles from the A operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-
-  // Define iterators over tiles from the A operand
-  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
-  using IteratorA0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
-        ElementA, layout::TensorNCxHWx<InterleavedK>,
-        ThreadMapA0
-      >
-    >;
-
-  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
-
-  // Define iterators over tiles from the B operand
-  // Note GEMM shared memory threadmap is used here because conv global memory
-  // layout needs to be mapped to fprop which is similar to the crosswise
-  // layout which is used by the interleaved GEMM shared memory threadmap.
-  // The Interleaved GEMM global memory layout is similar to the congruous
-  // layout.
-  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
-  using IteratorB0 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB0
-      >
-    >;
-  
-  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
-
-  /// Define iterators over tiles from scale/bias vectors
-  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
-  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
-  static int const kElementsPerAccess = 4; //For interleaved layout
-  using IteratorAccumulatorScaleBias =
-    cutlass::transform::threadblock::VectorIterator<
-      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
-          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
-          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
-          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
-    >;
-
-  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
-  using IteratorB1 =
-    cutlass::conv::threadblock::TileIterator<
-      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
-        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
-        ElementB, layout::TensorCxRSKx<InterleavedK>,
-        ThreadMapB1
-      >
-    >;
-  
-  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
-
-  // Warp-level GEMM components
-  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
-  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
-  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
-  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
-
-  // Use fragment iterator for the accumulator
-  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
-  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
-          WarpShape0, InstructionShape,
-          ElementAccumulator,
-          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
-          SmemAccumulatorLayout
-        >;
-
-
-  // Store Accumulator tiles to Shared Memory
-  using SmemIteratorD0 = 
-      cutlass::epilogue::warp::TileIteratorTensorOp<
-          WarpShape0,
-          InstructionShape,
-          ElementC,
-          SmemAccumulatorLayout
-        >;
-
-  static int const kThreadCount = 32;
-  // load warp tile from Shared Memory accumulator
-  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
-    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
-    ElementA, SmemAccumulatorLayout,
-    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
-    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
- 
-  // Define the Mma
-  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
-    ThreadblockShape0,
-    IteratorA0,
-    SmemIteratorA0,
-    IteratorB0,
-    SmemIteratorB0,
-    IteratorAccumulatorScaleBias,
-    FragmentIteratorAccumulator,
-    SmemIteratorD0,
-    ThreadblockShape1,
-    WarpIteratorA1,
-    IteratorB1,
-    SmemIteratorB1,
-    ElementC,
-    LayoutC,
-    EpilogueOutputOp0,
-    MmaPolicy0,
-    MmaPolicy1
-  >;
-
-  // Define the epilogue
-  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
-    ThreadblockShape1,
-    WarpMmaTensorOp1,
-    1,
-    EpilogueOutputOp1,
-    EpilogueOutputOp1::kCount,
-    InterleavedK
-  >::Epilogue;
-
-  // Define the kernel
-  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
-    B2bMma,
-    Epilogue,
-    ThreadblockSwizzle,
-    conv::Operator::kFprop
-  >;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
 } // namespace kernel
 } // namespace conv
 } // namespace cutlass
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
new file mode 100644
index 00000000..bd07b8b9
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
@@ -0,0 +1,749 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "kernel/default_b2b_conv2d_fprop.h"
+#include "kernel/b2b_implicit_gemm_convolution.h"
+#include "threadblock/b2b_implicit_gemm_pipelined.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
+/// and 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, LayoutA,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, LayoutB,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, LayoutB,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelined<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage 
+/// pipeline with interleaved layout.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  false
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelined<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
+/// and 2 stage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, LayoutA,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, LayoutB,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, LayoutB,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelined<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage 
+/// pipeline with interleaved layout.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelined<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
new file mode 100644
index 00000000..ab45a7a4
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
@@ -0,0 +1,740 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "kernel/default_b2b_conv2d_fprop.h"
+#include "kernel/b2b_implicit_gemm_convolution.h"
+#include "threadblock/b2b_implicit_gemm_multistage.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//                         OpClassTensorOp convolutions 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, LayoutA,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, LayoutB,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, LayoutB,
+      ThreadMapB1
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistage<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
+/// pipeline with interleaved layout.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB1
+    >;
+ 
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistage<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 
+/// multistage pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, LayoutA,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, LayoutB,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::ColumnMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, LayoutB,
+      ThreadMapB1
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistage<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
+// multistage pipeline with interleaved layout.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  // Use fragment iterator for A operand
+  using AccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorA1 = 
+      cutlass::gemm::warp::MmaTensorOpFragmentIterator<
+          cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape
+          cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape
+          MmaCore1::Shape::kK, //kBlocksColumn
+          ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape1::kM, WarpShape1::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Warp-level iterators to load scale and bias vectors
+  using FragmentIteratorA1ScaleBias = cutlass::transform::warp::VectorFragmentIterator<
+      MatrixShape<1, IteratorAccumulatorScaleBias::Fragment::kElements>, ElementScaleBias,
+      LayoutScaleBias, InstructionShape, kElementsPerAccess>;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB1
+    >;
+ 
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistage<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    ThreadblockShape1,
+    FragmentIteratorA1,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorA1ScaleBias,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
new file mode 100644
index 00000000..454f13ff
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
@@ -0,0 +1,817 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "kernel/default_b2b_conv2d_fprop.h"
+#include "kernel/b2b_implicit_gemm_convolution.h"
+#include "threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
+/// and 2 stage pipeline.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, LayoutA,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, LayoutB,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, LayoutB,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage 
+/// pipeline with interleaved layout.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4; //For interleaved layout
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+ // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
+/// and 2 stage pipeline.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, LayoutA,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, LayoutB,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, LayoutB,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename detail::DefaultConvEpilogue<
+    ArchTag,
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage 
+/// pipeline with interleaved layout.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      2, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+        ElementA, layout::TensorNCxHWx<InterleavedK>,
+        ThreadMapA0
+      >
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB0
+      >
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4; //For interleaved layout
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+        cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+        ElementB, layout::TensorCxRSKx<InterleavedK>,
+        ThreadMapB1
+      >
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmPipelinedSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    IteratorB0,
+    SmemIteratorB0,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    ElementC,
+    LayoutC,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
new file mode 100644
index 00000000..179c3120
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
@@ -0,0 +1,804 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+    Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped 
+      matrix multiply-add with the appropriate threadblock-scoped epilogue.  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/warp/vector_fragment_iterator.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "kernel/default_b2b_conv2d_fprop.h"
+#include "kernel/b2b_implicit_gemm_convolution.h"
+#include "threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
+/// pipeline.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, LayoutA,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, LayoutB,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, LayoutB,
+      ThreadMapB1
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
+/// pipeline with interleaved layout.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kAnalytic,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB1
+    >;
+ 
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 
+/// multistage pipeline.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, LayoutA,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, LayoutB,
+      ThreadMapB0
+    >;
+  
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, LayoutB,
+      ThreadMapB1
+    >;
+  
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
+// multistage pipeline with interleaved layout.
+/// Accumulator will be staged in shared memory.
+template <
+  typename ElementA,
+  typename ElementB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape0,
+  typename ThreadblockShape1,
+  typename WarpShape0,
+  typename WarpShape1,
+  typename InstructionShape,
+  typename EpilogueOutputOp0,
+  typename EpilogueOutputOp1,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  int InterleavedK
+>
+struct DefaultB2bConv2dFprop <
+  ElementA,
+  layout::TensorNCxHWx<InterleavedK>,
+  ElementB,
+  layout::TensorCxRSKx<InterleavedK>,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kOptimized,
+  true
+> {
+
+  // Define the core components from GEMM
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+      ElementB, layout::RowMajorInterleaved<InterleavedK>,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp,
+      Stages, MathOperatorTag, true>;
+
+  // Define iterators over tiles from the A operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapA0 = typename MmaCore0::SmemThreadMapA;
+  using IteratorA0 =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+      ElementA, layout::TensorNCxHWx<InterleavedK>,
+      ThreadMapA0
+    >;
+
+  using SmemIteratorA0 = typename MmaCore0::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  // Note GEMM shared memory threadmap is used here because conv global memory
+  // layout needs to be mapped to fprop which is similar to the crosswise
+  // layout which is used by the interleaved GEMM shared memory threadmap.
+  // The Interleaved GEMM global memory layout is similar to the congruous
+  // layout.
+  using ThreadMapB0 = typename MmaCore0::SmemThreadMapB;
+  using IteratorB0 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB0
+    >;
+
+  using SmemIteratorB0 = typename MmaCore0::SmemIteratorB;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp0::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  using ThreadMapB1 = typename MmaCore1::SmemThreadMapB;
+  using IteratorB1 =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized<
+      cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+      ElementB, layout::TensorCxRSKx<InterleavedK>,
+      ThreadMapB1
+    >;
+ 
+  using SmemIteratorB1 = typename MmaCore1::SmemIteratorB;
+
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          ElementC,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the Mma
+  using B2bMma = threadblock::B2bImplicitGemmMultistageSmemAccumulator<
+    ThreadblockShape0,
+    IteratorA0,
+    SmemIteratorA0,
+    arch::CacheOperation::Always,
+    IteratorB0,
+    SmemIteratorB0,
+    arch::CacheOperation::Global,
+    IteratorAccumulatorScaleBias,
+    FragmentIteratorAccumulator,
+    SmemIteratorD0,
+    ThreadblockShape1,
+    WarpIteratorA1,
+    IteratorB1,
+    SmemIteratorB1,
+    arch::CacheOperation::Global,
+    EpilogueOutputOp0,
+    MmaPolicy0,
+    MmaPolicy1,
+    Stages 
+  >;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue<
+    ThreadblockShape1,
+    WarpMmaTensorOp1,
+    1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount,
+    InterleavedK
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution<
+    B2bMma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
index fe050f03..4d807b9d 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -111,7 +117,9 @@ template <
   /// If true, kernel is configured to support serial reduction in the epilogue
   bool SplitKSerial,
   /// Operation performed by GEMM
-  typename Operator
+  typename Operator,
+  /// Stage accumulator in shared memory
+  bool SmemAccumulator = false
 >
 struct DefaultB2bGemm;
 
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
new file mode 100644
index 00000000..fcff8672
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
@@ -0,0 +1,397 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts. Partial
+      specializations here choose 'device::GemmTransposed' to implement this functionality.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/gemm_pipelined.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/vector_iterator.h"
+#include "cutlass/transform/threadblock/predicated_vector_access_iterator.h"
+
+#include "kernel/b2b_gemm.h"
+#include "threadblock/default_b2b_mma.h"
+#include "threadblock/default_b2b_mma_smem_accumulator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultB2bGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape0, ThreadblockShape1,
+                   WarpShape0, WarpShape1, InstructionShape,
+                   EpilogueOutputOp0, EpilogueOutputOp1, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator, true> {
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, 
+      InstructionShape, Stages, Operator, EpilogueOutputOp0, false, true>::ThreadblockB2bMma;
+
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          EpilogueOutputOp1::kCount>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Turing Architecture
+template <
+  /// Element type for A matrix operand
+  typename ElementA,
+  /// Layout type for A matrix operand
+  typename LayoutA,
+  /// Access granularity of A matrix in units of elements
+  int kAlignmentA,
+  /// Element type for B matrix operand
+  typename ElementB,
+  /// Layout type for B matrix operand
+  typename LayoutB,
+  /// Access granularity of B matrix in units of elements
+  int kAlignmentB,
+  /// Element type for C and D matrix operands
+  typename ElementC,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape0,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape0,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape1,
+  /// Warp-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp0,
+  /// Epilogue output operator
+  typename EpilogueOutputOp1,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial,
+  /// Operation performed by GEMM
+  typename Operator
+>
+struct DefaultB2bGemm<
+  ElementA, LayoutA, kAlignmentA,
+  ElementB, LayoutB, kAlignmentB,
+  ElementC, layout::RowMajor,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  arch::Sm75,
+  ThreadblockShape0,
+  ThreadblockShape1,
+  WarpShape0,
+  WarpShape1,
+  InstructionShape,
+  EpilogueOutputOp0,
+  EpilogueOutputOp1,
+  ThreadblockSwizzle,
+  2,
+  SplitKSerial,
+  Operator,
+  true
+> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementAccumulator,
+    layout::RowMajor,
+    arch::OpClassTensorOp,
+    arch::Sm75,
+    ThreadblockShape0,
+    ThreadblockShape1,
+    WarpShape0,
+    WarpShape1,
+    InstructionShape,
+    2,
+    Operator,
+    EpilogueOutputOp0,
+    false,
+    true
+  >::ThreadblockB2bMma;
+  
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape1,
+    typename B2bMma::Operator1,
+    kPartitionsK1,
+    EpilogueOutputOp1,
+    EpilogueOutputOp1::kCount
+  >::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+
+/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Number of Interleaved k
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultB2bGemm<
+    ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
+    ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, 
+    ElementC, layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
+    arch::OpClassTensorOp, arch::Sm80,
+    ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+    InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
+    ThreadblockSwizzle, Stages,
+    SplitKSerial, Operator, true> {
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+      InstructionShape, Stages, Operator, EpilogueOutputOp0,
+      true, true>::ThreadblockB2bMma;
+
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+/// Partial specialization for Turing Integer Tensor Core Interleaved layout
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp0,
+    /// Epilogue output operator
+    typename EpilogueOutputOp1,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of Interleaved k
+    int InterleavedK,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
+                   kAlignmentA, ElementB,
+                   layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
+                   ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
+                   int32_t, arch::OpClassTensorOp, arch::Sm75, 
+                   ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+                   InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
+                   ThreadblockSwizzle, 2, SplitKSerial, Operator, true> {
+  using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
+  using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
+  using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
+
+  using ElementAccumulator = int32_t;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+      ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm75, 
+      ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, 
+      InstructionShape, 2, Operator, EpilogueOutputOp0, true, true>::ThreadblockB2bMma;
+
+  static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK;
+
+  /// Define the epilogue for the 2nd Gemm
+  using Epilogue = typename cutlass::epilogue::threadblock::
+      DefaultInterleavedEpilogueTensorOp<
+          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
+          64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
+
+  /// Define the kernel-level GEMM operator.
+  using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/examples/13_two_tensor_op_fusion/test_run.h b/examples/13_two_tensor_op_fusion/test_run.h
new file mode 100644
index 00000000..b14becaf
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/test_run.h
@@ -0,0 +1,95 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+
+#include <iostream>
+
+// Run tests on GPUs 
+
+int testRun(int arch, std::vector<bool (*)()> & test_funcs, const std::string & test_name) {
+
+  bool supported = false;
+
+  int arch_major = arch / 10;
+  int arch_minor = arch - arch / 10 * 10;  
+
+  if(arch_major >= 8) {
+    // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+    //
+    // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+    if (__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) {
+      supported = true;
+    }
+  }
+  else if(arch_major >= 7) {
+    // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
+    //
+    // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
+    if (__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) {
+      supported = true;
+    }
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major == arch_major && props.minor == arch_minor)) {
+    supported = false;
+  }
+
+  if (!supported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    std::cout << "This example isn't supported on current architecture" << std::endl;
+    return 0;
+  }
+
+  bool pass = true;
+ 
+  std::cout << "Device: " << props.name << std::endl;
+  std::cout << "Arch: SM" << arch << std::endl;
+  std::cout << "Test: " << test_name << std::endl;
+  for(auto func : test_funcs) {
+    pass &= func();
+  }
+
+
+  if(pass)
+    return 0;
+  else
+    return -1;
+
+}
+
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
index aebc87dd..228bff27 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -693,7 +699,7 @@ class B2bImplicitGemmMultistage :
     // Mainloop
     //
 
-    CUTLASS_GEMM_LOOP
+    CUTLASS_PRAGMA_UNROLL
     for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1); 
             gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
       //
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
new file mode 100644
index 00000000..ab08c005
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
@@ -0,0 +1,816 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/cache_operation.h"
+#include "cutlass/gemm/threadblock/mma_base.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base_smem_accumulator.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA0_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA0_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA0,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB0_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB0_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB0,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: VectorIterator)
+    typename IteratorAccumulatorScaleBias_,
+    /// Iterates over accumulator tile
+    typename FragmentIteratorAccumulator_,
+    /// Iterates over accumulator tile in shared memory
+    typename SmemIteratorD0_,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile
+    //  (concept::MmaTensorOpFragmentIterator) 
+    typename WarpIteratorA1_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+    typename OutputOp_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bImplicitGemmMultistageSmemAccumulator : 
+  public gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages> {
+public:
+  ///< Base class
+  using Base = gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA0 = IteratorA0_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB0 = IteratorB0_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+  using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
+
+  using FragmentIteratorAccumulator = FragmentIteratorAccumulator_;  ///< Iterates over accumulator tile
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_;   ///< Iterates over the intermediate accumulator tile in shared memory
+
+  ///< Epilogue after 1st Gemm
+  using OutputOp = OutputOp_;
+  
+  static const bool PerChannelScale = (OutputOp::kScale ==
+      epilogue::thread::ScaleType::OnlyAlphaPerChannelScaling);
+ 
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+
+  //
+  // Dependent types
+  //
+
+  using ElementC = typename Policy0::Operator::ElementC;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+  
+  /// Fragment of Scale and Bias loaded from global memory
+  using FragmentA1ScaleBias = typename IteratorAccumulatorScaleBias::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Epilog in shared memory
+  using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
+    SmemIteratorD0,                 ///< SmemTileIterator
+    FragmentIteratorAccumulator,    ///< AccumulatorFragmentIterator
+    IteratorAccumulatorScaleBias,   ///< ScaleBiasIterator
+    OutputOp>;                      ///< Output operator
+  
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations0 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+    static_assert(Base::kWarpGemmIterations1 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA0 =
+        IteratorA0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB0 =
+        IteratorB0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA0 =
+        (AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB0 =
+        (AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
+  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
+  using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
+  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Shared Memory Iterator to store accumulator tile
+  SmemIteratorD0 smem_iterator_D0_;
+    
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bImplicitGemmMultistageSmemAccumulator(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::B2bMmaSharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+      smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+      smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+      warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+      smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
+    int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
+
+    int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
+    int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
+
+    int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
+    int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset(
+        {warp_idx_m_0, Base::kWarpGemmIterations0 * warp_idx_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset(
+        {Base::kWarpGemmIterations0 * warp_idx_k_0, warp_idx_n_0});
+    warp_tile_iterator_A1_.add_tile_offset(
+        {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
+    this->warp_tile_iterator_B1_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
+
+    // Add smem accumulator iterator warp offset
+    smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow, 
+                                        warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_0(
+    IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
+    int group_start_A0 = 0, int group_start_B0 = 0) {
+
+    iterator_A0.set_iteration_index(group_start_A0);
+    this->smem_iterator_A0_.set_iteration_index(group_start_A0);
+      
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
+
+      if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+                              IteratorA0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+                dst_ptr, iterator_A0.get(), iterator_A0.valid());
+
+        ++iterator_A0;
+
+        ++this->smem_iterator_A0_;
+      }
+    }
+
+    iterator_B0.set_iteration_index(group_start_B0);
+
+    this->smem_iterator_B0_.set_iteration_index(group_start_B0);
+    
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
+      if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+                              IteratorB0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+                dst_ptr, iterator_B0.get(), iterator_B0.valid());
+
+        ++iterator_B0;
+        ++this->smem_iterator_B0_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(
+    IteratorB1 &iterator_B1,
+    int group_start_B1 = 0) {
+
+    iterator_B1.set_iteration_index(group_start_B1);
+
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+    
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+        
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+                              IteratorB1::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+                dst_ptr, iterator_B1.get(), iterator_B1.valid());
+
+        ++iterator_B1;
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_0,
+      ///< destination accumulator tile
+      FragmentC1 &accum,
+      ///< iterator over A0 operand in global memory
+      IteratorA0 iterator_A0,
+      ///< iterator over B0 operand in global memory
+      IteratorB0 iterator_B0,
+      ///< iterator over A1 operand scale vector in global memory
+      IteratorAccumulatorScaleBias iterator_accum0_scale,
+      ///< iterator over A1 operand bias vector in global memory
+      IteratorAccumulatorScaleBias iterator_accum0_bias,
+      ///< iterator over B1 operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC0 const &src_accum,
+      ///< epilogue operation after 1st Gemm
+      OutputOp output_op_0,
+      ///< Imaginary strides used for planar-complex only - ignored here
+      int64_t imag_stride_A = 0,
+      int64_t imag_stride_B = 0) {
+
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_0) {
+
+      iterator_A0.set_iteration_index(0);
+      this->smem_iterator_A0_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) {
+        typename IteratorA0::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorA0::AccessType *>(
+            this->smem_iterator_A0_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorA0::Element>::value *
+            IteratorA0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+          dst_ptr, iterator_A0.get(), iterator_A0.valid());
+
+        ++iterator_A0;
+        ++this->smem_iterator_A0_;
+      }
+
+      iterator_B0.set_iteration_index(0);
+      this->smem_iterator_B0_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) {
+        typename IteratorB0::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB0::AccessType *>(
+              this->smem_iterator_B0_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB0::Element>::value *
+            IteratorB0::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+            dst_ptr, iterator_B0.get(), iterator_B0.valid());
+
+        ++iterator_B0;
+        ++this->smem_iterator_B0_;
+      }
+
+      // Move to the next stage
+      iterator_A0.advance();
+      iterator_B0.advance();
+
+      this->smem_iterator_A0_.add_tile_offset({0, 1});
+      this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
+    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
+    WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
+    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
+
+    Operator0 warp_mma0;
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance_0(iterator_A0, iterator_B0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
+                       warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
+    
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        
+        this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k > 0)
+          warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
+                             warp_transformed_frag_B0[warp_mma_k % 2],
+                             warp_loaded_frag_A0[warp_mma_k % 2],
+                             warp_loaded_frag_B0[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_A0, group_start_iteration_B0;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0) {
+          group_start_iteration_A0 = 0;
+          group_start_iteration_B0 = 0;
+        } else {
+          group_start_iteration_A0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
+        }
+
+        copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0,
+                               group_start_iteration_B0);
+
+        warp_mma0(
+                accum0, 
+                warp_transformed_frag_A0[warp_mma_k % 2],
+                warp_transformed_frag_B0[warp_mma_k % 2], 
+                accum0
+                );
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
+          warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A0.advance();
+          iterator_B0.advance();
+
+          this->smem_iterator_A0_.add_tile_offset({0, 1});
+          this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK *
+                        Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK *
+                     Base::kWarpGemmIterations0,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations_0;
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+    /// Epilogue for the first Implicit Gemm
+    Epilogue0 epilogue0;
+
+    epilogue0(output_op_0, smem_iterator_D0_, accum0, iterator_accum0_scale, iterator_accum0_bias);
+
+    __syncthreads();
+
+    // 2nd Implicit Gemm
+
+    //
+    // Prologue
+    //
+    int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_1) {
+
+      iterator_B1.set_iteration_index(0);
+      this->smem_iterator_B1_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) {
+        typename IteratorB1::AccessType *dst_ptr =
+          reinterpret_cast<typename IteratorB1::AccessType *>(
+              this->smem_iterator_B1_.get());
+
+        int const kSrcBytes =
+            sizeof_bits<typename IteratorB1::Element>::value *
+            IteratorB1::ThreadMap::kElementsPerAccess / 8;
+
+        cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+            dst_ptr, iterator_B1.get(), iterator_B1.valid());
+
+        ++iterator_B1;
+        ++this->smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.advance();
+
+      this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Inserts a fence to group cp.async instructions into stages.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Waits until kStages-2 stages have committed. 
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B1_;
+
+    // Start issuing the first group of the next stage outside of the mainloop
+    copy_tiles_and_advance_1(iterator_B1);
+
+    smem_write_stage_idx = Base::kStages - 1;
+    smem_read_stage_idx = 0;
+
+    warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
+                       warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
+
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for ( gemm_k_iterations_1 = Shape0::kN / Shape1::kK - (Base::kStages - 1); 
+            gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+
+        // Load warp-level tile from accumulator fragment
+        // skip warp tile loading for the last kgroup
+        if(gemm_k_iterations_1 > (-Base::kStages + 2) || warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+            warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+        }
+        ++warp_tile_iterator_A1_;
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_B1_;
+
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
+                             warp_transformed_frag_B1[warp_mma_k % 2],
+                             warp_loaded_frag_A1[warp_mma_k % 2],
+                             warp_loaded_frag_B1[warp_mma_k % 2]);
+
+        // Issue global->shared copies for the next stage
+        int group_start_iteration_B1;
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1) {
+          group_start_iteration_B1 = 0;
+        } else {
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+        }
+
+        copy_tiles_and_advance_1(iterator_B1,
+                               group_start_iteration_B1);
+
+        warp_mma1(
+                accum, 
+                warp_transformed_frag_A1[warp_mma_k % 2],
+                warp_transformed_frag_B1[warp_mma_k % 2], 
+                accum
+                );
+
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          // Inserts a fence to group cp.async instructions into stages.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages of cp.async have committed
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.advance();
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+        }
+      }
+
+    }
+
+    // Insert fence and wait for all outstanding cp.async operations to commit.
+    cutlass::arch::cp_async_fence();
+    cutlass::arch::cp_async_wait<0>();
+    __syncthreads();
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
index 92b069bb..6d0f6db4 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
index ea462f12..04e4f29f 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -67,8 +73,7 @@ template <
   /// (concept: WriteableTileIterator | RandomAccessTileIterator)
   typename SmemIteratorB0_,
   /// Iterates over vectors of scale and bias vector in global memory
-  //  (concept: ReadableTileIterator | ForwardTileIterator |
-  //  MaskedTileIterator)
+  //  (concept: VectorIterator)
   typename IteratorAccumulatorScaleBias_,
   /// Iterates over accumulator tile
   typename FragmentIteratorAccumulator_,
@@ -94,19 +99,19 @@ template <
   typename Policy0_,
   /// Policy describing tuning details (concept: MmaPolicy)
   typename Policy1_,
-  /// Transformation applied to A operand
+  /// Transformation applied to A0 operand
   typename TransformA0_ = NumericArrayConverter<
     typename SmemIteratorA0_::Element, 
     typename IteratorA0_::Element, 
     IteratorA0_::Fragment::kElements>,
   ///
-  /// Transformation applied to B operand
+  /// Transformation applied to B0 operand
   typename TransformB0_ = NumericArrayConverter<
     typename SmemIteratorB0_::Element, 
     typename IteratorB0_::Element, 
     IteratorB0_::Fragment::kElements>,
   ///
-  /// Transformation applied to B operand
+  /// Transformation applied to B1 operand
   typename TransformB1_ = NumericArrayConverter<
     typename SmemIteratorB1_::Element, 
     typename IteratorB1_::Element, 
@@ -396,7 +401,6 @@ class B2bImplicitGemmPipelinedSmemAccumulator :
 
           iterator_A.load(tb_frag_A);
           iterator_B0.load(tb_frag_B0);
-    
           ++iterator_A;
           ++iterator_B0;
         }
@@ -452,11 +456,8 @@ class B2bImplicitGemmPipelinedSmemAccumulator :
 
     smem_write_stage_idx = 1;
     
-//    int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
     int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
 
-    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-    // shared memory loads (which have the tighest latency requirement).
 
     //
     // Mainloop
@@ -477,6 +478,7 @@ class B2bImplicitGemmPipelinedSmemAccumulator :
 
         if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
 
+          // Write fragments to shared memory
           this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
 
           __syncthreads();
@@ -489,7 +491,8 @@ class B2bImplicitGemmPipelinedSmemAccumulator :
           }
           else {
             this->warp_tile_iterator_B1_.add_tile_offset(
-                {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1,
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
                  0});
           }
 
@@ -501,7 +504,7 @@ class B2bImplicitGemmPipelinedSmemAccumulator :
         
         // skip warp tile loading for the last kgroup
         if(gemm_k_iterations_1 > 1 || warp_mma_k < Base::kWarpGemmIterations1 - 1)
-            warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2]);
+          warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2]);
         this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
 
         ++warp_tile_iterator_A1_;
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
index fc2576f0..36d6d8c9 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
index 67c2d22b..22643570 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -64,11 +70,11 @@ template <
     /// Used for partial specialization
     typename Enable = bool>
 class B2bMmaBaseSmemAccumulator :
-  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages> {
 
  public:
   ///< Base class
-  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2>;
+  using Base = B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, Stages>;
 
   ///< Size of the Gemm problem - concept: gemm::GemmShape<>
   using Shape0 = Shape0_;
@@ -146,7 +152,6 @@ class B2bMmaBaseSmemAccumulator :
     AccumulatorSharedStorage0 accumulator_shared_storage0;
   }; 
 
-
 public:
 
   /// Construct from tensor references
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
index 9ef21275..7f9dba0d 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
new file mode 100644
index 00000000..2a5fb933
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
@@ -0,0 +1,860 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base_smem_accumulator.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape0_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA0_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA0_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA0,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB0_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB0_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB0,
+    /// Iterates over vectors of scale and bias vector in global memory
+    //  (concept: VectorIterator)
+    typename IteratorAccumulatorScaleBias_,
+    /// Iterates over accumulator tile
+    typename FragmentIteratorAccumulator_,
+    /// Iterates over accumulator tile in shared memory
+    typename SmemIteratorD0_,
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape1_,
+    /// Iterates over the intermediate accumulator tile in shared memory
+    typename WarpIteratorA1_,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB1_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB1_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB1,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+    typename OutputOp_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy0_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy1_,
+    /// Number of stages,
+    int Stages,
+    /// Used for partial specialization
+    typename Enable = bool>
+class B2bMmaMultistageSmemAccumulator : 
+  public gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages> {
+public:
+  ///< Base class
+  using Base = gemm::threadblock::B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape0 = Shape0_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA0 = IteratorA0_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB0 = IteratorB0_;
+  ///< Iterates over tiles of the scale and bias vectors in global memory
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;
+  ///< Policy describing tuning details
+  using Policy0 = Policy0_;
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+  using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
+
+  using FragmentIteratorAccumulator = FragmentIteratorAccumulator_;  ///< Iterates over accumulator tile
+
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape1 = Shape1_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB1 = IteratorB1_;
+  ///< Policy describing tuning details
+  using Policy1 = Policy1_;
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_;   ///< Iterates over the intermediate accumulator tile in shared memory
+
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+
+  ///< Epilogue after 1st Gemm
+  using OutputOp = OutputOp_;
+ 
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+
+  /// Epilog in shared memory
+  using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
+    SmemIteratorD0,                 ///< SmemTileIterator
+    FragmentIteratorAccumulator,    ///< AccumulatorFragmentIterator
+    IteratorAccumulatorScaleBias,   ///< ScaleBiasIterator
+    OutputOp>;                      ///< Output operator
+ 
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+  
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations0 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+    static_assert(Base::kWarpGemmIterations1 > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const TBLDGSTSIterationsA0 =
+        IteratorA0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB0 =
+        IteratorB0::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const TBLDGSTSIterationsB1 =
+        IteratorB1::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA0 =
+        (TBLDGSTSIterationsA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB0 =
+        (TBLDGSTSIterationsB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB1 =
+        (TBLDGSTSIterationsB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA0 = typename Operator0::FragmentA;
+  using WarpLoadedFragmentB0 = typename Operator0::FragmentB;
+  using WarpLoadedFragmentA1 = typename Operator1::FragmentA;
+  using WarpLoadedFragmentB1 = typename Operator1::FragmentB;
+  using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA;
+  using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB;
+  using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA;
+  using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A0_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+
+  /// Shared Memory Iterator to store accumulator tile
+  SmemIteratorD0 smem_iterator_D0_;
+    
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaMultistageSmemAccumulator(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::B2bMmaSharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+      smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+      smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+      warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+      smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
+    int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
+
+    int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
+    int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
+
+    int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
+    int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset(
+        {warp_idx_m_0, Base::kWarpGemmIterations0 * warp_idx_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset(
+        {Base::kWarpGemmIterations0 * warp_idx_k_0, warp_idx_n_0});
+    warp_tile_iterator_A1_.add_tile_offset(
+        {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1});
+    this->warp_tile_iterator_B1_.add_tile_offset(
+        {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1});
+
+    // Add smem accumulator iterator warp offset
+    smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow, 
+                                        warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_0(IteratorA0 &iterator_A0, IteratorB0 &iterator_B0,
+                              int group_start_A0 = 0, int group_start_B0 = 0) {
+    iterator_A0.set_iteration_index(group_start_A0 *
+                                   IteratorA0::kAccessesPerVector);
+    this->smem_iterator_A0_.set_iteration_index(group_start_A0);
+
+    // LDGSTS for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) {
+      if (group_start_A0 + j < Detail::TBLDGSTSIterationsA0) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA0::Element>::value *
+                              IteratorA0::ThreadMap::kElementsPerAccess /
+                              IteratorA0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, gmem_ptr, iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+    }
+
+    iterator_B0.set_iteration_index(group_start_B0 *
+                                   IteratorB0::kAccessesPerVector);
+    this->smem_iterator_B0_.set_iteration_index(group_start_B0);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) {
+      if (group_start_B0 + j < Detail::TBLDGSTSIterationsB0) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB0::Element>::value *
+                              IteratorB0::ThreadMap::kElementsPerAccess /
+                              IteratorB0::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B0.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, gmem_ptr, iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+        ++this->smem_iterator_B0_;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance_1(IteratorB1 &iterator_B1,
+                              int group_start_B1 = 0) {
+    iterator_B1.set_iteration_index(group_start_B1 *
+                                   IteratorB1::kAccessesPerVector);
+    this->smem_iterator_B1_.set_iteration_index(group_start_B1);
+
+    // LDGSTS for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) {
+      if (group_start_B1 + j < Detail::TBLDGSTSIterationsB1) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB1::Element>::value *
+                              IteratorB1::ThreadMap::kElementsPerAccess /
+                              IteratorB1::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B1.get();
+
+          cutlass::arch::cp_async<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, gmem_ptr, iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+        ++this->smem_iterator_B1_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations_0,
+      ///< destination accumulator tile
+      FragmentC1 &accum,
+      ///< iterator over A operand in global memory
+      IteratorA0 iterator_A0,
+      ///< iterator over B operand in global memory
+      IteratorB0 iterator_B0,
+      ///< iterator over B operand in global memory
+      IteratorB1 iterator_B1,
+      ///< initial value of accumulator
+      FragmentC0 const &src_accum,
+      ///< epilogue operation after 1st Gemm
+      OutputOp output_op_0)
+    {
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_0) {
+
+      iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+      iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+      iterator_A0.set_iteration_index(0);
+      this->smem_iterator_A0_.set_iteration_index(0);
+
+      // LDGSTS for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsA0; ++j) {
+        typename IteratorA0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA0::AccessType *>(
+                this->smem_iterator_A0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA0::Element>::value *
+              IteratorA0::ThreadMap::kElementsPerAccess /
+              IteratorA0::kAccessesPerVector / 8;
+
+          int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0);
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA0>(
+              dst_ptr + v, iterator_A0.get(), iterator_A0.valid());
+
+          ++iterator_A0;
+        }
+
+        ++this->smem_iterator_A0_;
+      }
+
+      iterator_B0.set_iteration_index(0);
+      this->smem_iterator_B0_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB0; ++j) {
+        typename IteratorB0::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB0::AccessType *>(
+                this->smem_iterator_B0_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB0::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB0::Element>::value *
+              IteratorB0::ThreadMap::kElementsPerAccess /
+              IteratorB0::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB0>(
+              dst_ptr + v, iterator_B0.get(), iterator_B0.valid());
+
+          ++iterator_B0;
+        }
+
+        ++this->smem_iterator_B0_;
+      }
+
+      // Move to the next stage
+      iterator_A0.add_tile_offset({0, 1});
+      iterator_B0.add_tile_offset({1, 0});
+
+      this->smem_iterator_A0_.add_tile_offset({0, 1});
+      this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA0 warp_loaded_frag_A0[2];
+    WarpLoadedFragmentB0 warp_loaded_frag_B0[2];
+    WarpTransformedFragmentA0 warp_transformed_frag_A0[2];
+    WarpTransformedFragmentB0 warp_transformed_frag_B0[2];
+
+    Operator0 warp_mma0;
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+    iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0],
+                       warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        
+        this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k > 0)
+          warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2],
+                             warp_transformed_frag_B0[warp_mma_k % 2],
+                             warp_loaded_frag_A0[warp_mma_k % 2],
+                             warp_loaded_frag_B0[warp_mma_k % 2]);
+
+        warp_mma0(
+          accum0, 
+          warp_transformed_frag_A0[warp_mma_k % 2],
+          warp_transformed_frag_B0[warp_mma_k % 2], 
+          accum0
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations0 - 1) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+
+          group_start_iteration_A0 = warp_mma_k * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 = warp_mma_k * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0, 
+                               group_start_iteration_B0);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations0) {
+          int group_start_iteration_A0, group_start_iteration_B0;
+          group_start_iteration_A0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA0;
+          group_start_iteration_B0 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB0;
+
+          copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0, 
+                               group_start_iteration_B0);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A0.add_tile_offset({0, 1});
+          iterator_B0.add_tile_offset({1, 0});
+
+          this->smem_iterator_A0_.add_tile_offset({0, 1});
+          this->smem_iterator_B0_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK *
+                        Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK *
+                     Base::kWarpGemmIterations0,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations_0;
+          iterator_A0.clear_mask(gemm_k_iterations_0 == 0);
+          iterator_B0.clear_mask(gemm_k_iterations_0 == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations0)
+          warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A0[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B0[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    /// Epilogue for the first Implicit Gemm
+    Epilogue0 epilogue0;
+
+    epilogue0(output_op_0, smem_iterator_D0_, accum0);
+
+    __syncthreads();
+
+
+    // 2nd Gemm
+
+    //
+    // Prologue
+    //
+    int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations_1) {
+
+      iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+      iterator_B1.set_iteration_index(0);
+      this->smem_iterator_B1_.set_iteration_index(0);
+
+      // LDGSTS for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::TBLDGSTSIterationsB1; ++j) {
+        typename IteratorB1::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB1::AccessType *>(
+                this->smem_iterator_B1_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB1::Element>::value *
+              IteratorB1::ThreadMap::kElementsPerAccess /
+              IteratorB1::kAccessesPerVector / 8;
+
+          cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB1>(
+              dst_ptr + v, iterator_B1.get(), iterator_B1.valid());
+
+          ++iterator_B1;
+        }
+
+        ++this->smem_iterator_B1_;
+      }
+
+      // Move to the next stage
+      iterator_B1.add_tile_offset({1, 0});
+
+      this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // DEPBAR+SYNC
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA1 warp_loaded_frag_A1[2];
+    WarpLoadedFragmentB1 warp_loaded_frag_B1[2];
+    WarpTransformedFragmentA1 warp_transformed_frag_A1[2];
+    WarpTransformedFragmentB1 warp_transformed_frag_B1[2];
+
+    Operator1 warp_mma1;
+
+    warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]);
+    ++warp_tile_iterator_A1_;
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+    this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]);
+    ++this->warp_tile_iterator_B1_;
+
+    iterator_B1.clear_mask(gemm_k_iterations_1 == 0);
+
+    smem_write_stage_idx = Base::kStages - 1;
+    smem_read_stage_idx = 0;
+
+    warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0],
+                       warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]);
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_PRAGMA_UNROLL
+    for ( gemm_k_iterations_1 = Shape0::kN / Shape1::kK - (Base::kStages - 1); 
+            gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1;
+           ++warp_mma_k) {
+
+        // Load warp-level tile from accumulator fragment
+        // skip warp tile loading for the last kgroup
+        if(gemm_k_iterations_1 > (-Base::kStages + 2) || warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+            warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2]);
+        }
+        ++warp_tile_iterator_A1_;
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+        ++this->warp_tile_iterator_B1_;
+
+
+        if (warp_mma_k > 0)
+          warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2],
+                             warp_transformed_frag_B1[warp_mma_k % 2],
+                             warp_loaded_frag_A1[warp_mma_k % 2],
+                             warp_loaded_frag_B1[warp_mma_k % 2]);
+
+
+        warp_mma1(
+          accum, 
+          warp_transformed_frag_A1[warp_mma_k % 2],
+          warp_transformed_frag_B1[warp_mma_k % 2], 
+          accum
+        );
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations1 - 1) {
+          int group_start_iteration_B1;
+
+          group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations1) {
+          int group_start_iteration_B1;
+          group_start_iteration_B1 =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB1;
+
+          copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_B1.add_tile_offset({1, 0});
+
+          this->smem_iterator_B1_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          iterator_B1.clear_mask(gemm_k_iterations_1 == 1);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations1)
+          warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A1[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B1[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
index c8c35092..df272289 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -106,7 +112,8 @@ template <
   /// Used for partial specialization
   typename Enable = bool
 >
-class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
+class B2bMmaPipelined : 
+  public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_, 2> {
 public:
 
   ///< Base class
@@ -130,7 +137,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
 
   using ElementC = ElementC_;       ///< Data type of accumulator matrix
   using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
-
+  
   using OutputOp = OutputOp_;       ///< Epilogue after 1st Gemm
 
   using TransformA0 = TransformA0_;
@@ -152,7 +159,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
 
   /// Warp-level Mma
   using Operator0 = typename Policy0::Operator;
-
+  
   /// Fragment of operand B loaded from global memory
   using FragmentB1 = typename IteratorB1::Fragment;
 
@@ -161,7 +168,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
 
   /// Warp-level Mma
   using Operator1 = typename Policy1::Operator;
-  
+ 
   /// Obtain the arch tag from the warp-level operator
   using ArchTag = typename Policy0::Operator::ArchTag;
 
@@ -174,7 +181,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
   /// Complex transform on B1 operand
   static ComplexTransform const kTransformB1 = Operator1::kTransformB;
 
-  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
   static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
 
 private:
@@ -237,16 +244,16 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
   /// Perform a threadblock-scoped matrix multiply-accumulate
   CUTLASS_DEVICE
   void operator()(
-    int gemm_k_iterations_0,                            ///< number of iterations of the mainloop
-    FragmentC1 &accum,                                  ///< destination accumulator tile
-    IteratorA0 iterator_A,                              ///< iterator over A operand in global memory
-    IteratorB0 iterator_B0,                             ///< iterator over B0 operand in global memory
-    IteratorB1 iterator_B1,                             ///< iterator over B1 operand in global memory  
-    FragmentC0 const &src_accum,                        ///< source accumualtor tile
-    OutputOp output_op_0,                               ///< epilogue operation after 1st Gemm
+    int gemm_k_iterations_0,                             ///< number of iterations of the mainloop
+    FragmentC1 &accum,                                   ///< destination accumulator tile
+    IteratorA0 iterator_A,                               ///< iterator over A operand in global memory
+    IteratorB0 iterator_B0,                              ///< iterator over B0 operand in global memory
+    IteratorB1 iterator_B1,                              ///< iterator over B1 operand in global memory  
+    FragmentC0 const &src_accum,                         ///< source accumualtor tile
+    OutputOp output_op_0,                                ///< epilogue operation after 1st Gemm
     TransformA0 transform_A0 = TransformA0(),            ///< transformation applied to A0 fragment
-    TransformB0 transform_B0 = TransformB0(),           ///< transformation applied to B0 fragment
-    TransformB1 transform_B1 = TransformB1()) {         ///< transformation applied to B1 fragment
+    TransformB0 transform_B0 = TransformB0(),            ///< transformation applied to B0 fragment
+    TransformB1 transform_B1 = TransformB1()) {          ///< transformation applied to B1 fragment
 
     //
     // Prologue
@@ -268,8 +275,8 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
     ++iterator_A;
     ++iterator_B0;
 
-    this->smem_iterator_A_.store(tb_frag_A);
-    this->smem_iterator_B0_.store(tb_frag_B0);
+    this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+    this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
 
     ++this->smem_iterator_A_;
     ++this->smem_iterator_B0_;
@@ -294,23 +301,19 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
     int smem_write_stage_idx = 1;
 
     // Avoid reading out of bounds
-    if (gemm_k_iterations_0 <= 1) {
-      iterator_A.clear_mask();
-      iterator_B0.clear_mask();
-    }
+    iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
+    iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
 
     // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
     // shared memory loads (which have the tighest latency requirement).
-    iterator_A.load(tb_frag_A);
 
     //
     // Mainloop
     //
 
-    // Note: The main loop does not support Base::WarpGemmIterations == 2.
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
     CUTLASS_GEMM_LOOP
     for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
-
       //
       // Loop over GEMM K dimension
       //
@@ -324,19 +327,14 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
         if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
 
           // Write fragments to shared memory
-          this->smem_iterator_A_.store(tb_frag_A);
+          this->smem_iterator_A_.store(transform_A0(tb_frag_A));
 
-          this->smem_iterator_B0_.store(tb_frag_B0);
+          this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
 
           __syncthreads();
-
-          // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
-          // shared memory loads (which have the tighest latency requirement).
-          iterator_A.load(tb_frag_A);
           
-          ++this->smem_iterator_B0_;
           ++this->smem_iterator_A_;
-        
+          ++this->smem_iterator_B0_;
 
           // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
           if (smem_write_stage_idx == 1) {
@@ -365,19 +363,18 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
 
         if (warp_mma_k == 0) {
 
+          iterator_A.load(tb_frag_A);
           iterator_B0.load(tb_frag_B0);
-
           ++iterator_A;
           ++iterator_B0;
 
           // Avoid reading out of bounds if this was the last loop iteration
-          if (gemm_k_iterations_0 <= 2) {
-            iterator_A.clear_mask();
-            iterator_B0.clear_mask();
-          }
+          iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
+          iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
         }
 
-        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0);
+        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
+                  warp_frag_B0[warp_mma_k % 2], accum0);
       }
     }
 
@@ -399,7 +396,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
 
     ++iterator_B1;
 
-    this->smem_iterator_B1_.store(tb_frag_B1);
+    this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
 
     ++this->smem_iterator_B1_;
 
@@ -409,7 +406,6 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
     WarpFragmentA1 warp_frag_A1[2];
     WarpFragmentB1 warp_frag_B1[2];
 
-    //warp_tile_iterator_A1_.set_kgroup_index(0);
     this->warp_tile_iterator_B1_.set_kgroup_index(0);
 
     warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0);
@@ -425,9 +421,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
     int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1;
 
     // Avoid reading out of bounds
-    if (gemm_k_iterations_1 <= 1) {
-      iterator_B1.clear_mask();
-    }
+    iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
 
     //
     // Mainloop
@@ -450,8 +444,7 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
         if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
 
           // Write fragments to shared memory
-
-          this->smem_iterator_B1_.store(tb_frag_B1);
+          this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
 
           __syncthreads();
           ++this->smem_iterator_B1_;
@@ -471,11 +464,10 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
         }
 
         this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
-        
+
         warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0);
         this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
 
-
         ++warp_tile_iterator_A1_;
         ++this->warp_tile_iterator_B1_;
 
@@ -484,17 +476,14 @@ class B2bMmaPipelined : public B2bMmaBase<Shape0_, Shape1_, Policy0_, Policy1_,
           iterator_B1.load(tb_frag_B1);
           ++iterator_B1;
 
-
           // Avoid reading out of bounds if this was the last loop iteration
-          if (gemm_k_iterations_1 <= 2) {
-            iterator_B1.clear_mask();
-          }
+          iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
         }
 
-        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum);
+        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], 
+                  warp_frag_B1[warp_mma_k % 2], accum);
       }
     }
-
   }
 };
 
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
new file mode 100644
index 00000000..4cd89ed9
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_base_smem_accumulator.h"
+#include "cutlass/epilogue/threadblock/epilogue_smem_accumulator.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions.
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape0_,
+  /// Iterates over tiles of A operand in global memory 
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorA0_,
+  /// Iterates over tiles of A operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorA0_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB0_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB0_,
+  /// Iterates over vectors of scale and bias vector in global memory
+  //  (concept: VectorIterator)
+  typename IteratorAccumulatorScaleBias_,
+  /// Iterates over accumulator tile
+  typename FragmentIteratorAccumulator_,
+  /// Iterates over accumulator tile in shared memory
+  typename SmemIteratorD0_,
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape1_,
+  /// Iterates over the intermediate accumulator tile in shared memory
+  typename WarpIteratorA1_,
+  /// Iterates over tiles of B operand in global memory
+  //  (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
+  typename IteratorB1_,
+  /// Iterates over tiles of B operand in shared memory
+  /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+  typename SmemIteratorB1_,
+  /// Data type of accumulator matrix
+  typename ElementC_,
+  /// Data type of accumulator matrix
+  typename LayoutC_,
+  /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) 
+  typename OutputOp_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy0_,
+  /// Policy describing tuning details (concept: MmaPipelinedPolicy)
+  typename Policy1_,
+  /// Transformation applied to A0 operand
+  typename TransformA0_ = NumericArrayConverter<
+    typename SmemIteratorA0_::Element, 
+    typename IteratorA0_::Element, 
+    IteratorA0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B0 operand
+  typename TransformB0_ = NumericArrayConverter<
+    typename SmemIteratorB0_::Element, 
+    typename IteratorB0_::Element, 
+    IteratorB0_::Fragment::kElements>,
+  ///
+  /// Transformation applied to B1 operand
+  typename TransformB1_ = NumericArrayConverter<
+    typename SmemIteratorB1_::Element, 
+    typename IteratorB1_::Element, 
+    IteratorB1_::Fragment::kElements>,
+  /// Used for partial specialization
+  typename Enable = bool
+>
+class B2bMmaPipelinedSmemAccumulator : 
+  public B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2> {
+public:
+
+  ///< Base class
+  using Base = B2bMmaBaseSmemAccumulator<Shape0_, Shape1_, Policy0_, Policy1_, SmemIteratorD0_, 2>;
+
+  using Shape0 = Shape0_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorA0 = IteratorA0_;     ///< Iterates over tiles of A operand in global memory
+  using IteratorB0 = IteratorB0_;     ///< Iterates over tiles of B operand in global memory
+  using IteratorAccumulatorScaleBias = IteratorAccumulatorScaleBias_;   ///< Iterates over tiles of the scale and bias vectors in global memory
+  using Policy0 = Policy0_;           ///< Policy0 describing tuning details
+
+  using SmemIteratorA0 = SmemIteratorA0_;
+  using SmemIteratorB0 = SmemIteratorB0_;
+  using SmemIteratorD0 = SmemIteratorD0_; ///< Iterates over accumulator tile in shared memory
+
+  using FragmentIteratorAccumulator = FragmentIteratorAccumulator_;  ///< Iterates over accumulator tile
+
+  using Shape1 = Shape1_;             ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using IteratorB1 = IteratorB1_;     ///< Iterates over tiles of B operand in global memory
+  using Policy1 = Policy1_;           ///< Policy1 describing tuning details
+
+  using SmemIteratorB1 = SmemIteratorB1_;
+  using WarpIteratorA1 = WarpIteratorA1_;   ///< Iterates over the intermediate accumulator tile in shared memory
+
+
+  using ElementC = ElementC_;       ///< Data type of accumulator matrix
+  using LayoutC = LayoutC_;         ///< Layout of accumulator matrix
+  
+  using OutputOp = OutputOp_;       ///< Epilogue after 1st Gemm
+
+  using TransformA0 = TransformA0_;
+  using TransformB0 = TransformB0_;
+  using TransformB1 = TransformB1_;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of operand A loaded from global memory
+  using FragmentA0 = typename IteratorA0::Fragment;
+
+  /// Fragment of operand B loaded from global memory
+  using FragmentB0 = typename IteratorB0::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC0 = typename Policy0::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator0 = typename Policy0::Operator;
+  
+  /// Fragment of operand B loaded from global memory
+  using FragmentB1 = typename IteratorB1::Fragment;
+
+  /// Fragment of accumulator tile
+  using FragmentC1 = typename Policy1::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator1 = typename Policy1::Operator;
+ 
+  /// Obtain the arch tag from the warp-level operator
+  using ArchTag = typename Policy0::Operator::ArchTag;
+
+  /// Complex transform on A0 operand
+  static ComplexTransform const kTransformA0 = Operator0::kTransformA;
+
+  /// Complex transform on B0 operand
+  static ComplexTransform const kTransformB0 = Operator0::kTransformB;
+
+  /// Complex transform on B1 operand
+  static ComplexTransform const kTransformB1 = Operator1::kTransformB;
+
+  /// staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
+  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
+
+  /// Epilog in shared memory
+  using Epilogue0 = epilogue::threadblock::EpilogueSmemAccumulator<
+    SmemIteratorD0,                 ///< SmemTileIterator
+    FragmentIteratorAccumulator,    ///< AccumulatorFragmentIterator
+    IteratorAccumulatorScaleBias,   ///< ScaleBiasIterator
+    OutputOp>;                      ///< Output operator
+
+
+
+private:
+
+  using WarpFragmentA0 = typename Operator0::FragmentA;
+  using WarpFragmentB0 = typename Operator0::FragmentB;
+  using WarpFragmentA1 = typename Operator1::FragmentA;
+  using WarpFragmentB1 = typename Operator1::FragmentB;
+
+protected:
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA0 smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B0 operand to shared memory
+  SmemIteratorB0 smem_iterator_B0_;
+    
+  /// Shared Memory Iterator to store accumulator tile
+  SmemIteratorD0 smem_iterator_D0_;
+    
+  /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile
+  WarpIteratorA1 warp_tile_iterator_A1_;
+
+  /// Iterator to write threadblock-scoped tile of B1 operand to shared memory
+  SmemIteratorB1 smem_iterator_B1_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  B2bMmaPipelinedSmemAccumulator(
+    typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM
+    int thread_idx,                                     ///< ID within the threadblock
+    int warp_idx,                                       ///< ID of warp
+    int lane_idx                                        ///< ID of each thread within a warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    smem_iterator_A_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_A_ref(), thread_idx),
+    smem_iterator_B0_(shared_storage.b2b_mma_shared_storage.shared_storage0.operand_B_ref(), thread_idx),
+    smem_iterator_D0_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+    warp_tile_iterator_A1_(shared_storage.accumulator_shared_storage0.accum_ref(), lane_idx),
+    smem_iterator_B1_(shared_storage.b2b_mma_shared_storage.shared_storage1.operand_B_ref(), thread_idx) {
+
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn_0 = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN);
+    int warp_idx_k_0 = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN);
+
+    int warp_idx_m_0 = warp_idx_mn_0 % Base::WarpCount0::kM;
+    int warp_idx_n_0 = warp_idx_mn_0 / Base::WarpCount0::kM;
+
+    int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k_0;
+
+    int warp_idx_mn_1 = warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN);
+    int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN);
+
+    int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM;
+    int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM;
+
+    int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k_1;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m_0, tile_offset_k_0});
+    this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n_0});
+    warp_tile_iterator_A1_.add_tile_offset({warp_idx_m_1, tile_offset_k_1});
+    this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n_1});
+
+    // Add smem accumulator iterator warp offset
+    smem_iterator_D0_.add_tile_offset({ warp_idx_m_0 * SmemIteratorD0::TileIterations::kRow, 
+                                        warp_idx_n_0 * SmemIteratorD0::TileIterations::kColumn});
+
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+    int gemm_k_iterations_0,                             ///< number of iterations of the mainloop
+    FragmentC1 &accum,                                   ///< destination accumulator tile
+    IteratorA0 iterator_A,                               ///< iterator over A operand in global memory
+    IteratorB0 iterator_B0,                              ///< iterator over B0 operand in global memory
+    IteratorB1 iterator_B1,                              ///< iterator over B1 operand in global memory  
+    FragmentC0 const &src_accum,                         ///< source accumualtor tile
+    OutputOp output_op_0,                                ///< epilogue operation after 1st Gemm
+    TransformA0 transform_A0 = TransformA0(),            ///< transformation applied to A0 fragment
+    TransformB0 transform_B0 = TransformB0(),            ///< transformation applied to B0 fragment
+    TransformB1 transform_B1 = TransformB1()) {          ///< transformation applied to B1 fragment
+
+    //
+    // Prologue
+    //
+
+    // Perform accumulation in the 'd' output operand
+    FragmentC0 accum0 = src_accum;
+
+    FragmentA0 tb_frag_A;
+    FragmentB0 tb_frag_B0;
+
+    tb_frag_A.clear();
+    tb_frag_B0.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_A.load(tb_frag_A);
+    iterator_B0.load(tb_frag_B0);
+
+    ++iterator_A;
+    ++iterator_B0;
+
+    this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+    this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+    ++this->smem_iterator_A_;
+    ++this->smem_iterator_B0_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA0 warp_frag_A0[2];
+    WarpFragmentB0 warp_frag_B0[2];
+
+    this->warp_tile_iterator_A0_.set_kgroup_index(0);
+    this->warp_tile_iterator_B0_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A0_.load(warp_frag_A0[0]);
+    this->warp_tile_iterator_B0_.load(warp_frag_B0[0]);
+
+    ++this->warp_tile_iterator_A0_;
+    ++this->warp_tile_iterator_B0_;
+
+    Operator0 warp_mma0;
+
+    int smem_write_stage_idx = 1;
+
+    // Avoid reading out of bounds
+    iterator_A.clear_mask(gemm_k_iterations_0 <= 1);
+    iterator_B0.clear_mask(gemm_k_iterations_0 <= 1);
+
+    // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing 
+    // shared memory loads (which have the tighest latency requirement).
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations0 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_A_.store(transform_A0(tb_frag_A));
+
+          this->smem_iterator_B0_.store(transform_B0(tb_frag_B0));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_A_;
+          ++this->smem_iterator_B0_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_A0_.add_tile_offset(
+                {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0});
+            this->warp_tile_iterator_B0_.add_tile_offset(
+                {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+        }
+
+        this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0);
+        
+        this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A0_;
+        ++this->warp_tile_iterator_B0_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_A.load(tb_frag_A);
+          iterator_B0.load(tb_frag_B0);
+          ++iterator_A;
+          ++iterator_B0;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_A.clear_mask(gemm_k_iterations_0 <= 2);
+          iterator_B0.clear_mask(gemm_k_iterations_0 <= 2);
+        }
+
+        warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2],
+                  warp_frag_B0[warp_mma_k % 2], accum0);
+      }
+    }
+
+    /// Epilogue for the first Implicit Gemm
+    Epilogue0 epilogue0;
+
+    epilogue0(output_op_0, smem_iterator_D0_, accum0);
+
+    __syncthreads();
+ 
+    //2nd Gemm
+
+    //
+    // Prologue
+    //
+
+    FragmentB1 tb_frag_B1;
+
+    tb_frag_B1.clear();
+
+    // The last kblock is loaded in the prolog
+    iterator_B1.load(tb_frag_B1);
+
+    ++iterator_B1;
+
+    this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+    ++this->smem_iterator_B1_;
+
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math instructions
+    WarpFragmentA1 warp_frag_A1[2];
+    WarpFragmentB1 warp_frag_B1[2];
+
+    this->warp_tile_iterator_B1_.set_kgroup_index(0);
+
+    warp_tile_iterator_A1_.load(warp_frag_A1[0]);
+    this->warp_tile_iterator_B1_.load(warp_frag_B1[0]);
+
+    ++warp_tile_iterator_A1_;
+    ++this->warp_tile_iterator_B1_;
+
+    Operator1 warp_mma1;
+
+    smem_write_stage_idx = 1;
+    
+    int gemm_k_iterations_1 = Shape0::kN / Shape1::kK;
+
+    // Avoid reading out of bounds
+    iterator_B1.clear_mask(gemm_k_iterations_1 <= 1);
+
+    //
+    // Mainloop
+    //
+
+    // Note: The main loop does not support Base::kWarpGemmIterations == 2.
+    CUTLASS_PRAGMA_UNROLL
+    for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
+        // as the case may be.
+
+        if (warp_mma_k == Base::kWarpGemmIterations1 - 1) {
+
+          // Write fragments to shared memory
+          this->smem_iterator_B1_.store(transform_B1(tb_frag_B1));
+
+          __syncthreads();
+          
+          ++this->smem_iterator_B1_;
+
+          // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
+          if (smem_write_stage_idx == 1) {
+            this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0});
+          }
+          else {
+            this->warp_tile_iterator_B1_.add_tile_offset(
+                {-Base::kStages * Policy1::kPartitionsK *
+                     Base::kWarpGemmIterations1,
+                 0});
+          }
+
+          smem_write_stage_idx ^= 1;
+
+        }
+
+        this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1);
+        
+        // skip warp tile loading for the last kgroup
+        if(gemm_k_iterations_1 > 1 || warp_mma_k < Base::kWarpGemmIterations1 - 1)
+          warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]);
+
+        ++warp_tile_iterator_A1_;
+        ++this->warp_tile_iterator_B1_;
+
+        if (warp_mma_k == 0) {
+
+          iterator_B1.load(tb_frag_B1);
+    
+          ++iterator_B1;
+
+          // Avoid reading out of bounds if this was the last loop iteration
+          iterator_B1.clear_mask(gemm_k_iterations_1 <= 2);
+        }
+
+        warp_mma1(accum, warp_frag_A1[warp_mma_k % 2],
+                 warp_frag_B1[warp_mma_k % 2], accum);
+
+      }
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
index 7cd95065..02e8e206 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -89,7 +95,9 @@ template <
     typename EpilogueOutputOp,
     /// Store the accumulators in row major or column major.  Row major is used
     /// when output layout is interleaved.
-    bool AccumulatorsInRowMajor = false>
+    bool AccumulatorsInRowMajor = false,
+    /// Staging the accumulators in shared memory.
+    bool SmemAccumulator = false>
 struct DefaultB2bMma;
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
new file mode 100644
index 00000000..b0a76896
--- /dev/null
+++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
@@ -0,0 +1,605 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/arch/arch.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h"
+
+#include "threadblock/b2b_mma_pipelined_smem_accumulator.h"
+#include "threadblock/b2b_mma_multistage_smem_accumulator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output with 2-stage pipeline
+/// Accumulator will be staged in shared memory.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, 
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, 2, Operator, EpilogueOutputOp, false, true> {
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor,
+      arch::OpClassTensorOp, 2, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB, kAlignmentB>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          typename EpilogueOutputOp::ElementOutput,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelinedSmemAccumulator<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      IteratorB0, typename MmaCore0::SmemIteratorB,
+      IteratorAccumulatorScaleBias,
+      FragmentIteratorAccumulator, SmemIteratorD0,
+      typename MmaCore1::Shape, WarpIteratorA1,
+      IteratorB1, typename MmaCore1::SmemIteratorB, 
+      ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for row-major output for multi-stage
+/// Accumulator will be staged in shared memory.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Epilogue output operator
+    typename EpilogueOutputOp>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, 
+                  ThreadblockShape0, ThreadblockShape1,
+                  WarpShape0, WarpShape1,
+                  InstructionShape, Stages, Operator, EpilogueOutputOp, false, true> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+ 
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, 
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using AccessTypeA0 = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+          ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using AccessTypeB0 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>,
+          ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using AccessTypeB1 = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::RowMajor;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 2;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>,
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>,
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          typename EpilogueOutputOp::ElementOutput,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistageSmemAccumulator<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      MmaCore0::kCacheOpA, 
+      IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
+      IteratorAccumulatorScaleBias,
+      FragmentIteratorAccumulator, SmemIteratorD0,
+      typename MmaCore1::Shape, WarpIteratorA1,
+      IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB,
+      ElementAccumulator, layout::RowMajor,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for column-major-interleaved output with 2-stage pipeline
+/// Accumulator will be staged in shared memory.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, arch::Sm75, 
+                  ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+                  InstructionShape, 2, Operator, EpilogueOutputOp, true, true> {
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator, 
+      true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, 2, Operator,
+      true>;
+
+  static_assert(kAlignmentA == 128 / sizeof_bits<ElementA>::value, 
+    "Alignment must match thread data map's vector length");
+
+  static_assert(kAlignmentB ==128 / sizeof_bits<ElementB>::value,
+    "Alignment must match thread data map's vector length");
+
+  // Define iterators over tiles from the A operand
+  using IteratorA0 = cutlass::transform::threadblock::PredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore0::Shape::kM, MmaCore0::Shape::kK>, ElementA,
+      LayoutA, 1, typename MmaCore0::IteratorThreadMapA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator<
+      cutlass::MatrixShape<MmaCore0::Shape::kK, MmaCore0::Shape::kN>, ElementB,
+      LayoutB, 0, typename MmaCore0::IteratorThreadMapB>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<MmaCore1::Shape::kK, MmaCore1::Shape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4; //For interleaved layout
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kN>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+ 
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          typename EpilogueOutputOp::ElementOutput,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+  // Define the threadblock-scoped pipelined matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelinedSmemAccumulator<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      IteratorB0, typename MmaCore0::SmemIteratorB,
+      IteratorAccumulatorScaleBias, 
+      FragmentIteratorAccumulator, SmemIteratorD0,
+      typename MmaCore1::Shape, WarpIteratorA1,
+      IteratorB1, typename MmaCore1::SmemIteratorB, 
+      ElementAccumulator, layout::ColumnMajorInterleaved<InterleavedK>,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+/// Specialization for column-major-interleaved output with multi-stage
+/// Accumulator will be staged in shared memory.
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape0,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape1,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape0,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape1,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Number of Interleaved K
+    int InterleavedK>
+struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
+                  kAlignmentB, ElementAccumulator,
+                  layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, ArchTag, 
+                  ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
+                  InstructionShape, Stages, Operator, EpilogueOutputOp, true, true> {
+  // Define the MmaCore components
+  using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
+      Operator, true>;
+  using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator,
+      layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass, Stages,
+      Operator, true>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+  using IteratorA0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>,
+          ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+  using IteratorB0 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB;
+  using IteratorB1 =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>,
+          ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB>;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp0 = typename MmaCore0::MmaTensorOp;
+  using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp;
+  using MmaPolicy0 = typename MmaCore0::MmaPolicy;
+  using MmaPolicy1 = typename MmaCore1::MmaPolicy;
+
+  // Use fragment iterator for the accumulator
+  using SmemAccumulatorLayout = cutlass::layout::ColumnMajorInterleaved<16>;
+  using FragmentIteratorAccumulator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
+          WarpShape0, InstructionShape,
+          ElementAccumulator,
+          typename WarpMmaTensorOp0::Policy::Operator::FragmentC,
+          SmemAccumulatorLayout
+        >;
+
+  /// Define iterators over tiles from scale/bias vectors
+  using ElementScaleBias = typename EpilogueOutputOp::ElementCompute;
+  using LayoutScaleBias = layout::RowMajor; //vector layout doesn't really matter
+  static int const kElementsPerAccess = 4;
+  using IteratorAccumulatorScaleBias =
+    cutlass::transform::threadblock::VectorIterator<
+      cutlass::transform::threadblock::PredicatedVectorAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kN>, 
+          cutlass::MatrixShape<WarpShape0::kM, WarpShape0::kK>, 
+          ElementScaleBias, LayoutScaleBias, kElementsPerAccess>
+    >;
+
+  // Store Accumulator tiles to Shared Memory
+  using SmemIteratorD0 = 
+      cutlass::epilogue::warp::TileIteratorTensorOp<
+          WarpShape0,
+          InstructionShape,
+          typename EpilogueOutputOp::ElementOutput,
+          SmemAccumulatorLayout
+        >;
+
+  static int const kThreadCount = 32;
+  // load warp tile from Shared Memory accumulator
+  using WarpIteratorA1 = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIteratorCanonical<
+    MatrixShape<WarpShape1::kM, InstructionShape::kK>, cutlass::gemm::Operand::kA, 
+    ElementA, SmemAccumulatorLayout,
+    MatrixShape<InstructionShape::kM, InstructionShape::kK>,
+    WarpMmaTensorOp1::Policy::OpDelta::kRow, kThreadCount>;
+ 
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistageSmemAccumulator<
+      typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA,
+      MmaCore0::kCacheOpA, 
+      IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB,
+      IteratorAccumulatorScaleBias, 
+      FragmentIteratorAccumulator, SmemIteratorD0,
+      typename MmaCore1::Shape, WarpIteratorA1,
+      IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB, 
+      ElementAccumulator, layout::ColumnMajorInterleaved<InterleavedK>,
+      EpilogueOutputOp,
+      typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
index eaf892cf..23b8f0dd 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
+++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   14_ampere_tf32_tensorop_gemm
   ampere_tf32_tensorop_gemm.cu
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
index da36c67b..eed9f9b2 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -132,12 +138,12 @@ struct Options {
       << "  This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --alpha <f32>               Epilogue scalar alpha\n"
-      << "  --beta <f32>                Epilogue scalar beta\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n\n";
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm --m=1024 --n=512 --k=1024 \\\n"
diff --git a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
index 97690fc8..fbd20df8 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
+++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   15_ampere_sparse_tensorop_gemm
   ampere_sparse_tensorop_gemm.cu
diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
index d6c70006..4a25a963 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
+++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
index 4524a836..e3afbb96 100644
--- a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   16_ampere_tensorop_conv2dfprop
   ampere_tensorop_conv2dfprop.cu
diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
index a4163d97..a35a3946 100644
--- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
+++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -321,21 +327,21 @@ struct Options {
       << "  forward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
@@ -427,7 +433,8 @@ Result profile_convolution(Options const &options) {
   cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
   cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
-  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());
 
   //
   // Initialize tensors
@@ -453,15 +460,20 @@ Result profile_convolution(Options const &options) {
   cutlass::reference::host::TensorFill(
       tensor_c.host_view());
 
-  // Fill tensor C for reference on host with zeros
+  // Fill tensor D on host with zeros
   cutlass::reference::host::TensorFill(
-      tensor_ref_c.host_view());
+      tensor_d.host_view());
+
+  // Fill tensor D for reference on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_d.host_view());
 
   // Copy data from host to GPU
   tensor_a.sync_device();
   tensor_b.sync_device();
   tensor_c.sync_device();
-  tensor_ref_c.sync_device();
+  tensor_d.sync_device();
+  tensor_ref_d.sync_device();
 
   //
   // Define arguments for CUTLASS Convolution
@@ -491,7 +503,7 @@ Result profile_convolution(Options const &options) {
     tensor_a.device_ref(),
     tensor_b.device_ref(),
     tensor_c.device_ref(),
-    tensor_c.device_ref(),
+    tensor_d.device_ref(),
     {options.alpha, options.beta},
   };
 
@@ -542,17 +554,17 @@ Result profile_convolution(Options const &options) {
       tensor_a.host_ref(),
       tensor_b.host_ref(),
       tensor_c.host_ref(),
-      tensor_ref_c.host_ref(),
+      tensor_ref_d.host_ref(),
       options.alpha,
       options.beta
     );
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
-    tensor_c.sync_host();
+    tensor_d.sync_host();
 
     bool passed = cutlass::reference::host::TensorEquals(
-      tensor_c.host_view(),
-      tensor_ref_c.host_view());
+      tensor_d.host_view(),
+      tensor_ref_d.host_view());
 
     if (!passed) {
       result.reference_check = cutlass::Status::kErrorInternal;
@@ -584,10 +596,10 @@ Result profile_convolution(Options const &options) {
       << "Filters = \n" << tensor_b.host_view() << "\n\n";
 
     if (options.reference_check) {
-      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
+      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
     }
 
-    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
+    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;
 
     std::cout << "Results written to '" << ss.str() << "'." << std::endl;
   }
diff --git a/examples/17_fprop_per_channel_bias/CMakeLists.txt b/examples/17_fprop_per_channel_bias/CMakeLists.txt
index 05a92aa8..5ca41a41 100644
--- a/examples/17_fprop_per_channel_bias/CMakeLists.txt
+++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   17_fprop_per_channel_bias 
   fprop_per_channel_bias.cu
diff --git a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
index cce5edda..2b6b25c7 100644
--- a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
+++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
index f20f666b..21a21e90 100644
--- a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
+++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   18_ampere_fp64_tensorop_affine2_gemm
   ampere_fp64_tensorop_affine2_gemm.cu
diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
index b32b20ab..62450e21 100644
--- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
+++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -46,8 +52,7 @@ This example uses Ampere fp64 tensore core Affine2 GEMM as an example.  SIMT
 #include <sstream>
 
 #include "cutlass/cutlass.h"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
+#include "cutlass/gemm/device/gemm_universal.h"
 #include "cutlass/reduction/device/reduce_split_k.h"
 #include "cutlass/reduction/kernel/reduce_split_k.h"
 #include "cutlass/reduction/thread/reduction_operators.h"
@@ -110,13 +115,13 @@ using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
     ElementOutput,                                        // Data type of output matrix.
     1,                                                    // The number of elements per memory
                                                           // access has.  It has to be 1 for
-                                                          // affine2.
+                                                          // affine2. 
     ElementAccumulator,
     ElementComputeEpilogue>;
 
-using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
-  ElementInputA, LayoutInputA, cutlass::ComplexTransform::kNone, 1, // AlignmentA has to be 1
-  ElementInputB, LayoutInputB, cutlass::ComplexTransform::kNone, 1, // AlignmentB has to be 1
+using Gemm = typename cutlass::gemm::device::GemmUniversal<
+  ElementInputA, LayoutInputA,
+  ElementInputB, LayoutInputB,
   ElementOutput, LayoutOutput,
   ElementAccumulator,
   MMAOp,
@@ -127,10 +132,10 @@ using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
   EpilogueOp,
   SwizzleThreadBlock,
   NumStages,
+  1,
+  1,
   cutlass::arch::OpMultiplyAdd
->::GemmKernel;
-
-using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+>;
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/examples/19_tensorop_canonical/CMakeLists.txt b/examples/19_tensorop_canonical/CMakeLists.txt
index 9b78ea47..c48ff260 100644
--- a/examples/19_tensorop_canonical/CMakeLists.txt
+++ b/examples/19_tensorop_canonical/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   19_tensorop_canonical
   tensorop_canonical.cu
diff --git a/examples/19_tensorop_canonical/tensorop_canonical.cu b/examples/19_tensorop_canonical/tensorop_canonical.cu
index 6bb7f34e..15ad17f0 100644
--- a/examples/19_tensorop_canonical/tensorop_canonical.cu
+++ b/examples/19_tensorop_canonical/tensorop_canonical.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/20_simt_canonical/CMakeLists.txt b/examples/20_simt_canonical/CMakeLists.txt
index 18fef93f..25f2969f 100644
--- a/examples/20_simt_canonical/CMakeLists.txt
+++ b/examples/20_simt_canonical/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   20_simt_canonical
   simt_canonical.cu
diff --git a/examples/20_simt_canonical/simt_canonical.cu b/examples/20_simt_canonical/simt_canonical.cu
index c58c6d88..d905d4da 100644
--- a/examples/20_simt_canonical/simt_canonical.cu
+++ b/examples/20_simt_canonical/simt_canonical.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/examples/21_quaternion_gemm/CMakeLists.txt b/examples/21_quaternion_gemm/CMakeLists.txt
index 3fbf22e0..14d71c07 100644
--- a/examples/21_quaternion_gemm/CMakeLists.txt
+++ b/examples/21_quaternion_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   21_quaternion_gemm
   quaternion_gemm.cu
diff --git a/examples/21_quaternion_gemm/quaternion_gemm.cu b/examples/21_quaternion_gemm/quaternion_gemm.cu
index 720b7f6a..95c7b4d7 100644
--- a/examples/21_quaternion_gemm/quaternion_gemm.cu
+++ b/examples/21_quaternion_gemm/quaternion_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -124,19 +130,19 @@ struct Options {
       << "  This example uses the CUTLASS Library to execute Quaternion GEMM computations.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --batch <int>               Number of GEMM operations executed in one batch\n"
-      << "  --alpha <f32>               Epilogue scalar alpha (real part)\n"
-      << "  --alpha_i <f32>             Epilogue scalar alpha_i (imaginary part)\n"
-      << "  --alpha_j <f32>             Epilogue scalar alpha_j (imaginary part)\n"
-      << "  --alpha_k <f32>             Epilogue scalar alpha_k (imaginary part)\n"
-      << "  --beta <f32>                Epilogue scalar beta (real part)\n\n"
-      << "  --beta_i <f32>              Epilogue scalar beta_i (imaginary part)\n\n"
-      << "  --beta_j <f32>              Epilogue scalar beta_j (imaginary part)\n\n"
-      << "  --beta_k <f32>              Epilogue scalar beta_k (imaginary part)\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n\n";
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+      << "  --alpha_i=<f32>             Epilogue scalar alpha_i (imaginary part)\n"
+      << "  --alpha_j=<f32>             Epilogue scalar alpha_j (imaginary part)\n"
+      << "  --alpha_k=<f32>             Epilogue scalar alpha_k (imaginary part)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (real part)\n\n"
+      << "  --beta_i=<f32>              Epilogue scalar beta_i (imaginary part)\n\n"
+      << "  --beta_j=<f32>              Epilogue scalar beta_j (imaginary part)\n\n"
+      << "  --beta_k=<f32>              Epilogue scalar beta_k (imaginary part)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/21_quaternion_gemm/21_quaternion_gemm  --batch=7 --m=1024 --n=512 --k=1024 \\\n"
diff --git a/examples/22_quaternion_conv/CMakeLists.txt b/examples/22_quaternion_conv/CMakeLists.txt
index a6cbee70..9bfad4ff 100644
--- a/examples/22_quaternion_conv/CMakeLists.txt
+++ b/examples/22_quaternion_conv/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   22_quaternion_conv
   quaternion_conv.cu 
diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu
index a790b69c..cd2a48d9 100644
--- a/examples/22_quaternion_conv/quaternion_conv.cu
+++ b/examples/22_quaternion_conv/quaternion_conv.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -249,21 +255,21 @@ struct Options {
       << "  forward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
index a7103fa4..49d313f4 100644
--- a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
+++ b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   23_ampere_gemm_operand_reduction_fusion
   ampere_gemm_operand_reduction_fusion.cu
diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
index a31d9838..bb880f4f 100644
--- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
+++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -267,22 +273,22 @@ struct Options {
     out << "28_ampere_gemm_bias_fusion example\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --m <int>            GEMM M\n"
-      << "  --n <int>            GEMM N\n"
-      << "  --k <int>            GEMM K\n"
-      << "  --split-k-slices <int> Split K Slices\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --m=<int>            GEMM M\n"
+      << "  --n=<int>            GEMM N\n"
+      << "  --k=<int>            GEMM K\n"
+      << "  --split-k-slices=<int> Split K Slices\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --parallel-split-k   If set (true), use parallel split K\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several problem sizes.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
-      << "$ ./examples/23_ampere_gemm_operand_reduction_fusion/23_ampere_gemm_operand_reduction_fusion --m=1024 --n=1024 --k=1024 \n\n";
+      << "$ ./examples/23_ampere_gemm_bias_fusion_example/ampere_gemm_bias_fusion  --m=1024 --n=1024 --k=1024 \n\n";
 
     return out;
   }
@@ -418,9 +424,9 @@ Result profile(Options const &options) {
     {alpha, beta},
     tensor_a.device_ref().data(),              // <- reference to tensor A on device
     tensor_b.device_ref().data(),              // <- reference to tensor B on device
-    tensor_c.device_ref().data(),  // <- reference to matrix C on device
-    tensor_d.device_ref().data(),  // <- reference to matrix D on device
-    tensor_reduction.device_ref().data(),              // <- reference to reduction tensor on device
+    tensor_c.device_ref().data(),              // <- reference to matrix C on device
+    tensor_d.device_ref().data(),              // <- reference to matrix D on device
+    tensor_reduction.device_ref().data(),      // <- reference to reduction tensor on device
     options.problem_size.m() * options.problem_size.k(),
     options.problem_size.n() * options.problem_size.k(),
     options.problem_size.m() * options.problem_size.n(),
diff --git a/examples/24_gemm_grouped/CMakeLists.txt b/examples/24_gemm_grouped/CMakeLists.txt
index fd5f4073..c9f3558e 100644
--- a/examples/24_gemm_grouped/CMakeLists.txt
+++ b/examples/24_gemm_grouped/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   24_gemm_grouped
   gemm_grouped.cu 
diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu
index 8a518b1b..cfeb1ba5 100644
--- a/examples/24_gemm_grouped/gemm_grouped.cu
+++ b/examples/24_gemm_grouped/gemm_grouped.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -44,13 +50,13 @@
     Examples:
 
       # Runs a grouped GEMM with 100 random problem sizes
-      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100
+      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100
 
       # Runs a grouped GEMM with 100 random problem sizes (with GEMM-K dimension equal to 1024)
-      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100 --k=1024 --verbose=true
+      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100 --k=1024 --verbose=true
 
       # Runs a grouped GEMM that is equivalent to a batched GEMM
-      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100 --m=2048 --n=1024 --k=1024 --verbose=true
+      $ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100 --m=2048 --n=1024 --k=1024 --verbose=true
 
       # Execute Grouped GEMM and profile with NSight
       $ nv-nsight-cu-cli ./examples/24_gemm_grouped/24_gemm_grouped --m=256 --n=256 --k=256 --verbose=true \
@@ -64,6 +70,7 @@
 #include <fstream>
 #include <sstream>
 #include <vector>
+#include <map>
 #include <unordered_map>
 
 #include "cutlass/cutlass.h"
@@ -125,17 +132,28 @@ struct HashGemmCoord {
 struct Options {
 
   bool help;
+  bool error;
   bool reference_check;
 
   std::vector<cutlass::gemm::GemmCoord> problem_sizes;
-  std::unordered_map<cutlass::gemm::GemmCoord, std::vector<int32_t>, HashGemmCoord> problem_bins;
 
+  // problem size bins
+  std::unordered_map<
+    cutlass::gemm::GemmCoord,
+    std::vector<int32_t>,
+    HashGemmCoord> problem_bins;
+
+  int alignment;
   int problem_count;
   int iterations;
   int cuda_streams;
   bool verbose;
   float alpha;
   float beta;
+  std::string benchmark_path;
+
+  std::string   output_tag;
+  std::ofstream output_file;
 
   //
   // Methods
@@ -143,6 +161,8 @@ struct Options {
 
   Options():
     help(false),
+    error(false),
+    alignment(8),
     reference_check(true),
     problem_count(15),
     iterations(20),
@@ -158,8 +178,10 @@ struct Options {
 
     if (cmd.check_cmd_line_flag("help")) {
       help = true;
+      return;
     }
 
+    cmd.get_cmd_line_argument("alignment", alignment, 8);
     cmd.get_cmd_line_argument("groups", problem_count, 15);
     cmd.get_cmd_line_argument("alpha", alpha, 1.0f);
     cmd.get_cmd_line_argument("beta", beta, 0.0f);    
@@ -167,6 +189,51 @@ struct Options {
     cmd.get_cmd_line_argument("streams", cuda_streams, 0);
     cmd.get_cmd_line_argument("verbose", verbose, false);
     cmd.get_cmd_line_argument("reference-check", reference_check, true);
+    cmd.get_cmd_line_argument("benchmark", benchmark_path);
+
+    std::string output_path;
+    cmd.get_cmd_line_argument("tag", output_tag);
+    cmd.get_cmd_line_argument("output_file", output_path);
+
+    if (!output_path.empty()) {
+
+      std::ios_base::openmode open_mode = std::ios_base::out;
+
+      std::ifstream input_file(output_path.c_str());
+
+      if (input_file.good()) {
+        open_mode = std::ios_base::app;
+        input_file.close();
+      }
+
+      output_file.open(output_path.c_str(), open_mode);
+
+      if (output_file.good() && open_mode != std::ios_base::app) {
+        output_file << "Tag,Provider,Kind,Groups,Runtime,GFLOPs\n";
+      }
+    }
+
+    // Decide how to initialize the problems
+    if (!benchmark_path.empty()) {
+      if (!benchmark_problems()) {
+        error = true;
+        problem_sizes.clear();
+        return;
+      }
+    }
+    else {
+      randomize_problems(cmd);
+    }
+
+    // Post-process the problem sizes
+    bin_problems();
+  }
+
+  void randomize_problems(cutlass::CommandLine &cmd) {
+
+    //
+    // For now, randomly choose the problem sizes.
+    //
 
     int cmd_line_m = -1;
     int cmd_line_n = -1;
@@ -176,10 +243,6 @@ struct Options {
     cmd.get_cmd_line_argument("n", cmd_line_n);
     cmd.get_cmd_line_argument("k", cmd_line_k);
 
-    //
-    // For now, randomly choose the problem sizes.
-    //
-
     problem_sizes.reserve(problem_count);
 
     for (int i = 0; i < problem_count; ++i) {
@@ -189,27 +252,75 @@ struct Options {
       int k = cmd_line_k;
 
       if (m < 1) {
-        m = 8 * (rand() % 256) + 8;
+        m = alignment * ((rand() % 256) + 1);
       }
 
       if (n < 1) {
-        n = 8 * (rand() % 256) + 8;
+        n = alignment * ((rand() % 256) + 1);
       }
 
       if (k < 1) {
-        k = 8 * (rand() % 256) + 8;
+        k = alignment * ((rand() % 256) + 1);
       }
 
       cutlass::gemm::GemmCoord problem(m, n, k);
 
       problem_sizes.push_back(problem);
     }
+  }
+
+  /// Load a benchmark
+  bool benchmark_problems() {
+    std::ifstream file(benchmark_path);
+    if (!file.good()) {
+      return false;
+    }
+
+    while (file.good()) {
+
+      int idx = -1;
+      std::string extent_str;
+
+      file >> idx >> extent_str;
+
+      if (idx < 0 || extent_str.empty()) {
+        break;
+      }
+
+      cutlass::gemm::GemmCoord extent;
+      std::vector<std::string> tokens;
+
+      cutlass::CommandLine::tokenize(tokens, extent_str, 'x');
+
+      for (int i = 0; i < int(tokens.size()); ++i) {
+        int x = std::atoi(tokens.at(i).c_str());
+
+        // round up
+        if (x % alignment) {
+          x += (alignment - (x % alignment));
+        }
+
+        extent.at(i) = x;
+      }
+
+      if (extent.product()) {
+        problem_sizes.push_back(extent);
+      }
+    }
+
+    return true;
+  }
+
+  /// Post processes the problems
+  void bin_problems() {
+
+    problem_count = int(problem_sizes.size());
 
     //
     // Insert the problem sizes into a sorted container class. This is *NOT* necessary
     // to run the CUTLASS kernel, but it enables the execution of cublas's batched GEMM.
     //
-    for (int i = 0; i < problem_count; ++i) {
+    for (int i = 0; i < int(problem_sizes.size()); ++i) {
       auto it = problem_bins.find(problem_sizes.at(i));
       if (it == problem_bins.end()) {
         problem_bins.insert({problem_sizes.at(i), std::vector<int32_t>({i}) });
@@ -230,6 +341,9 @@ struct Options {
       << "  in device Global Memory and loaded by the kernel.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --benchmark=<str>           Executes a benchmark problem size.\n"
+      << "  --output_file=<str>         Path to a CSV file to output results. If it exists already, results are appended.\n"
+      << "  --tag=<str>                 String tag to prepend to the CSV file.\n"
       << "  --groups=<int>              Number of individual GEMM problems (default: --groups=15)\n"
       << "  --m=<int>                   Sets the M dimension for all groups. Otherwise, it is selected randomly\n"
       << "  --n=<int>                   Sets the N dimension for all groups. Otherwise, it is selected randomly\n"
@@ -243,13 +357,24 @@ struct Options {
     out << "\n\nExamples:\n\n"
 
       << "# Runs a grouped GEMM with 100 random problem sizes\n"
-      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100\n\n"
+      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100\n\n"
 
       << "# Runs a grouped GEMM with 100 random problem sizes (with GEMM-K dimension equal to 1024)\n"
-      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100 --k=1024 --verbose=true\n\n"
+      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100 --k=1024 --verbose=true\n\n"
 
       << "# Runs a grouped GEMM that is equivalent to a batched GEMM\n"
-      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups 100 --m=2048 --n=1024 --k=1024 --verbose=true\n\n"
+      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --groups=100 --m=2048 --n=1024 --k=1024 --verbose=true\n\n"
+
+      << "# Runs a grouped GEMM problem given an externally supplied benchmark file. This is a text file in which\n"
+      << "# Each line contains a unique group index and an MxNxK triple indicating problemsize.\n"
+      << "#\n"
+      << "# For example, assume the following are the contents of 'problems.txt'\n"
+      << "#\n"
+      << "# 0 1024x256x520\n"
+      << "# 1 520x264x1024\n"
+      << "# 2 96x48x1024\n"
+      << "#\n"
+      << "$ ./examples/24_gemm_grouped/24_gemm_grouped --benchmark=problems.txt\n\n"
 
       << "# Execute Grouped GEMM and profile with NSight\n"
       << "$ nv-nsight-cu-cli ./examples/24_gemm_grouped/24_gemm_grouped --m=256 --n=256 --k=256 --verbose=true --iterations=1 --reference-check=false\n\n";
@@ -302,7 +427,7 @@ private:
   // Data members
   //
 
-  Options options;
+  Options & options;
 
   /// Initialization
   cutlass::Distribution::Kind init_A;
@@ -344,7 +469,7 @@ public:
   //
 
   TestbedGrouped(
-    Options const &options_,
+    Options &options_,
     cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
     cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
     cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
@@ -833,6 +958,11 @@ public:
     std::cout << "    " << "Grouped Runtime: " << result.runtime_ms << " ms" << std::endl;
     std::cout << "    " << "Grouped  GFLOPs: " << result.gflops << std::endl;
 
+    if (options.output_file.good()) {
+      options.output_file << options.output_tag << ",CUTLASS,grouped,"
+        << problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl;
+    }
+
     return result;
   }
 
@@ -923,6 +1053,7 @@ public:
     // Create CUDA streams to maximize concurrency of batched-array GEMM kernels
     //
     std::vector<cudaStream_t>   cuda_streams;
+    char const *provider = "CUTLASS";
 
     //
     // Warmup run
@@ -1189,6 +1320,11 @@ public:
     std::cout << "    " << "Batched Runtime: " << result.runtime_ms << " ms" << std::endl;
     std::cout << "    " << "Batched  GFLOPs: " << result.gflops << std::endl;
 
+    if (options.output_file.good()) {
+      options.output_file << options.output_tag << "," << provider << ",batched,"
+        << problem_count() << "," << result.runtime_ms << "," << result.gflops << std::endl;
+    }
+
     result.passed = true;
     return result;
   }
@@ -1236,6 +1372,11 @@ int main(int argc, char const **args) {
     return 0;
   }
 
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
   //
   // Define the Grouped GEMM type
   //
@@ -1243,21 +1384,25 @@ int main(int argc, char const **args) {
   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = float;
 
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+
   using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
     cutlass::half_t, 
-    cutlass::layout::ColumnMajor, 
+    LayoutA,
     cutlass::ComplexTransform::kNone,
     8,
     cutlass::half_t,
-    cutlass::layout::ColumnMajor, 
+    LayoutB,
     cutlass::ComplexTransform::kNone,
     8,
-    ElementOutput, cutlass::layout::ColumnMajor,
+    ElementOutput, LayoutC,
     ElementAccumulator, 
     cutlass::arch::OpClassTensorOp, 
     cutlass::arch::Sm80,
     cutlass::gemm::GemmShape<128, 128, 32>,
-    cutlass::gemm::GemmShape<64, 64, 32>, 
+    cutlass::gemm::GemmShape<64, 64, 32>,
     cutlass::gemm::GemmShape<16, 8, 16>,
     cutlass::epilogue::thread::LinearCombination<
         ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
@@ -1273,9 +1418,9 @@ int main(int argc, char const **args) {
 
   // Gemm operator cutlass_tensorop_f16_s16816gemm_f16_128x128_32x4_nt_align8
   using GemmBatched = cutlass::gemm::device::GemmUniversal<
-    ElementOutput, cutlass::layout::ColumnMajor,
-    ElementOutput, cutlass::layout::ColumnMajor,
-    ElementOutput, cutlass::layout::ColumnMajor,
+    cutlass::half_t, LayoutA,
+    cutlass::half_t, LayoutB,
+    ElementOutput,   LayoutC,
     ElementAccumulator,
     cutlass::arch::OpClassTensorOp,
     cutlass::arch::Sm80,
diff --git a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
index 6334a08b..0bf0c775 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
+++ b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   25_ampere_fprop_mainloop_fusion
   ampere_fprop_mainloop_fusion.cu
diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
index aba02625..661efcf7 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
+++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -268,21 +274,21 @@ struct Options {
       << "  forward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/25_ampere_fprop_mainloop_fusion/25_ampere_fprop_mainloop_fusion  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
@@ -381,7 +387,8 @@ Result profile_convolution(Options const &options) {
   cutlass::HostTensor<ElementInputScaleBias, LayoutInputScaleBias>
       tensor_a_bias({1, options.input_size.c()});
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
-  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.output_size());
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.output_size());
 
   //
   // Initialize tensors
@@ -425,9 +432,13 @@ Result profile_convolution(Options const &options) {
   cutlass::reference::host::TensorFill(
       tensor_c.host_view());
 
-  // Fill tensor C for reference on host with zeros
+  // Fill tensor D on host with zeros
   cutlass::reference::host::TensorFill(
-      tensor_ref_c.host_view());
+      tensor_d.host_view());
+
+  // Fill tensor D for reference on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_d.host_view());
 
   // Copy data from host to GPU
   tensor_a.sync_device();
@@ -435,7 +446,8 @@ Result profile_convolution(Options const &options) {
   tensor_a_bias.sync_device();
   tensor_b.sync_device();
   tensor_c.sync_device();
-  tensor_ref_c.sync_device();
+  tensor_d.sync_device();
+  tensor_ref_d.sync_device();
 
   //
   // Define arguments for CUTLASS Convolution
@@ -465,7 +477,7 @@ Result profile_convolution(Options const &options) {
     tensor_a_scale.device_ref(),
     tensor_a_bias.device_ref(),
     tensor_c.device_ref(),
-    tensor_c.device_ref(),
+    tensor_d.device_ref(),
     {options.alpha, options.beta},
   };
 
@@ -532,18 +544,18 @@ Result profile_convolution(Options const &options) {
       tensor_transformed_a.device_ref(),
       tensor_b.device_ref(),
       tensor_c.device_ref(),
-      tensor_ref_c.device_ref(),
+      tensor_ref_d.device_ref(),
       options.alpha,
       options.beta
     );
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
-    tensor_c.sync_host();
-    tensor_ref_c.sync_host();
+    tensor_d.sync_host();
+    tensor_ref_d.sync_host();
 
     bool passed = cutlass::reference::host::TensorEquals(
-      tensor_c.host_view(),
-      tensor_ref_c.host_view());
+      tensor_d.host_view(),
+      tensor_ref_d.host_view());
 
     if (!passed) {
       result.reference_check = cutlass::Status::kErrorInternal;
@@ -562,7 +574,7 @@ Result profile_convolution(Options const &options) {
 
     std::stringstream ss;
 
-    ss << "18_ampere_fused_fprop_batch_normalization_"
+    ss << "25_ampere_fprop_mainloop_fusion_"
       << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
       << "_"
       << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() 
@@ -575,10 +587,10 @@ Result profile_convolution(Options const &options) {
       << "Filters = \n" << tensor_b.host_view() << "\n\n";
 
     if (options.reference_check) {
-      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
+      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
     }
 
-    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
+    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;
 
     std::cout << "Results written to '" << ss.str() << "'." << std::endl;
   }
diff --git a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
index d35277d0..f836f6e0 100644
--- a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
+++ b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   26_ampere_wgrad_mainloop_fusion
   ampere_wgrad_mainloop_fusion.cu
diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
index 001aa912..da3ec1ca 100644
--- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
+++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -265,21 +271,21 @@ struct Options {
       << "  backward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --ref-check          If set (true), reference check on the host is computed\n"
       << "  --perf-check         If set (true), performance is measured.\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/26_ampere_fused_fprop_batch_normalization/26_ampere_fused_wgrad_batch_normalization  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
@@ -379,7 +385,8 @@ Result profile_convolution(Options const &options) {
       tensor_b_bias({1, options.input_size.c()});
 
   cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.filter_size);
-  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.filter_size);
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(options.filter_size);
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(options.filter_size);
 
   //
   // Initialize tensors
@@ -423,9 +430,13 @@ Result profile_convolution(Options const &options) {
   cutlass::reference::host::TensorFill(
       tensor_c.host_view());
 
-  // Fill tensor C for reference on host with zeros
+  // Fill tensor D on host with zeros
   cutlass::reference::host::TensorFill(
-      tensor_ref_c.host_view());
+      tensor_d.host_view());
+
+  // Fill tensor D for reference on host with zeros
+  cutlass::reference::host::TensorFill(
+      tensor_ref_d.host_view());
 
   // Copy data from host to GPU
   tensor_a.sync_device();
@@ -433,7 +444,8 @@ Result profile_convolution(Options const &options) {
   tensor_b_scale.sync_device();
   tensor_b_bias.sync_device();
   tensor_c.sync_device();
-  tensor_ref_c.sync_device();
+  tensor_d.sync_device();
+  tensor_ref_d.sync_device();
 
   //
   // Define arguments for CUTLASS Convolution
@@ -463,7 +475,7 @@ Result profile_convolution(Options const &options) {
     tensor_b_scale.device_ref(),
     tensor_b_bias.device_ref(),
     tensor_c.device_ref(),
-    tensor_c.device_ref(),
+    tensor_d.device_ref(),
     {options.alpha, options.beta},
   };
 
@@ -530,18 +542,18 @@ Result profile_convolution(Options const &options) {
       tensor_a.device_ref(),
       tensor_transformed_b.device_ref(),
       tensor_c.device_ref(),
-      tensor_ref_c.device_ref(),
+      tensor_ref_d.device_ref(),
       options.alpha,
       options.beta
     );
 
     // Check if output from CUTLASS kernel and reference kernel are equal or not
-    tensor_c.sync_host();
-    tensor_ref_c.sync_host();
+    tensor_d.sync_host();
+    tensor_ref_d.sync_host();
 
     bool passed = cutlass::reference::host::TensorEquals(
-      tensor_c.host_view(),
-      tensor_ref_c.host_view());
+      tensor_d.host_view(),
+      tensor_ref_d.host_view());
 
     if (!passed) {
       result.reference_check = cutlass::Status::kErrorInternal;
@@ -560,7 +572,7 @@ Result profile_convolution(Options const &options) {
 
     std::stringstream ss;
 
-    ss << "26_ampere_fused_wgrad_batch_normalization_"
+    ss << "26_ampere_wgrad_mainloop_fusion_"
       << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
       << "_"
       << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() 
@@ -573,10 +585,10 @@ Result profile_convolution(Options const &options) {
       << "Filters = \n" << tensor_b.host_view() << "\n\n";
 
     if (options.reference_check) {
-      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
+      output_workspace << "Reference = \n" << tensor_ref_d.host_view() << "\n\n";
     }
 
-    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
+    output_workspace << "Computed = \n" << tensor_d.host_view() << std::endl;
 
     std::cout << "Results written to '" << ss.str() << "'." << std::endl;
   }
diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
index 52260e44..06559637 100644
--- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
+++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -201,14 +207,14 @@ struct Options {
       << "  This example uses the CUTLASS Library to emulate FP32 with TF32 tensorop GEMM computations.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --alpha <f32>               Epilogue scalar alpha\n"
-      << "  --beta <f32>                Epilogue scalar beta\n\n"
-      << "  --rand_mode <string>        gauss / uniform*\n\n"
-      << "  --seed <int>                Random number seed (1*)\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --rand_mode=<string>        gauss / uniform*\n\n"
+      << "  --seed=<int>                Random number seed (1*)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n"
       << "  --benchmark                 If set (true), performance benchmarking on several layers and batch-size.\n\n";
 
     out << "\n\nExamples:\n\n"
diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
index fb2b4f91..c551b125 100644
--- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
+++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   27_ampere_3xtf32_fast_accurate_tensorop_gemm
   27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
index 693e7c9b..04ac4bd1 100644
--- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
+++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   28_ampere_3xtf32_fast_accurate_tensorop_fprop
   ampere_3xtf32_fast_accurate_tensorop_fprop.cu
diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
index 6208f1a7..b2996f2d 100644
--- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
+++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -258,19 +264,19 @@ struct Options {
       << "  forward convolution on tensors of layout NHWC.\n\n"
       << "Options:\n\n"
       << "  --help               If specified, displays this usage statement.\n\n"
-      << "  --n <int>            Input tensor extent N\n"
-      << "  --h <int>            Input tensor extent H\n"
-      << "  --w <int>            Input tensor extent W\n"
-      << "  --c <int>            Input tensor extent C\n"
-      << "  --k <int>            Filter extent K\n"
-      << "  --r <int>            Filter extent R\n"
-      << "  --s <int>            Filter extent S\n\n"
-      << "  --alpha <float>      Epilogue scalar alpha\n"
-      << "  --beta <float>       Epilogue scalar beta\n\n"
+      << "  --n=<int>            Input tensor extent N\n"
+      << "  --h=<int>            Input tensor extent H\n"
+      << "  --w=<int>            Input tensor extent W\n"
+      << "  --c=<int>            Input tensor extent C\n"
+      << "  --k=<int>            Filter extent K\n"
+      << "  --r=<int>            Filter extent R\n"
+      << "  --s=<int>            Filter extent S\n\n"
+      << "  --alpha=<float>      Epilogue scalar alpha\n"
+      << "  --beta=<float>       Epilogue scalar beta\n\n"
       << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-      << "  --iterations <int>   Number of profiling iterations to perform.\n"
+      << "  --iterations=<int>   Number of profiling iterations to perform.\n"
       << "  --save-workspace     If set, workspace is written to a text file.\n"
-      << "  --tag <string>       String to replicate across the first column in the results table\n";
+      << "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
       << "$ ./examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/28_ampere_3xtf32_fast_accurate_tensorop_fprop  --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu
index 08e85c61..6f89d64f 100644
--- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu
+++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -164,14 +170,14 @@ struct Options {
       << "  This example uses the CUTLASS Library to emulate FP32 complex GEMM computations with TF32 tensor cores.\n\n"
       << "Options:\n\n"
       << "  --help                      If specified, displays this usage statement.\n\n"
-      << "  --m <int>                   GEMM M dimension\n"
-      << "  --n <int>                   GEMM N dimension\n"
-      << "  --k <int>                   GEMM K dimension\n"
-      << "  --alpha <f32>               Epilogue scalar alpha\n"
-      << "  --beta <f32>                Epilogue scalar beta\n\n"
-      << "  --rand_mode <string>        gauss / uniform*\n\n"
-      << "  --seed <int>                Random number seed (1*)\n\n"
-      << "  --iterations <int>          Number of profiling iterations to perform.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --rand_mode=<string>        gauss / uniform*\n\n"
+      << "  --seed=<int>                Random number seed (1*)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n"
       << "  --benchmark                 If set (true), performance benchmarking on several layers and batch-size.\n\n";
 
     out << "\n\nExamples:\n\n"
diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
index d7b59a39..c7e896ba 100644
--- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
+++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 cutlass_example_add_executable(
   29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm
   29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm.cu
diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
index 7a1d2bea..b49446cc 100644
--- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu
+++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -29,7 +35,6 @@ impligit GEMM, corresponding to the sequential reduction loop, is very large (N
 reduction is highly effective for such cases. Given split_k_slices parameter, it partitions the K loop into
 split_k_slices chunks and computes partial reductions in parallel across different blocks. After that,
 a parallel reduction kernel is launched to accumulate partial reductions.
-
 In practice, wgrad requires fp32 accumulation to avoid overflow. When the input is fp16, some care is needed
 to correctly instantiate the GEMM template.
 */
@@ -112,7 +117,7 @@ using EpilogueOpGEMM = cutlass::epilogue::thread::LinearCombination<
 // The epilogue functor for reduction. This is the one that is actually used.
 using EpilogueOpReduction = cutlass::epilogue::thread::LinearCombination<
     ElementOutput,                                     // Data type of output matrix.
-    128 / cutlass::sizeof_bits<ElementAccumulator>::value,  // The number of elements per vectorized.
+    128 / cutlass::sizeof_bits<ElementOutput>::value,  // The number of elements per vectorized.
     // memory access. This becomes the vector width of
     // math instructions in the epilogue too.
     ElementAccumulator,                                // Data type of accumulator
@@ -294,22 +299,22 @@ struct Options {
 	<< "  Split-k with parallel reduction is highly effective for such cases.\n\n"
 	<< "Options:\n\n"
 	<< "  --help               If specified, displays this usage statement.\n\n"
-	<< "  --n <int>            Input tensor extent N\n"
-	<< "  --h <int>            Input tensor extent H\n"
-	<< "  --w <int>            Input tensor extent W\n"
-	<< "  --c <int>            Input tensor extent C\n"
-	<< "  --k <int>            Filter extent K\n"
-	<< "  --r <int>            Filter extent R\n"
-	<< "  --s <int>            Filter extent S\n\n"
-	<< "  --alpha <float>      Epilogue scalar alpha\n"
-	<< "  --beta <float>       Epilogue scalar beta\n\n"
-	<< "  --split-k-slices <int>   Split-k factor \n\n"
+	<< "  --n=<int>            Input tensor extent N\n"
+	<< "  --h=<int>            Input tensor extent H\n"
+	<< "  --w=<int>            Input tensor extent W\n"
+	<< "  --c=<int>            Input tensor extent C\n"
+	<< "  --k=<int>            Filter extent K\n"
+	<< "  --r=<int>            Filter extent R\n"
+	<< "  --s=<int>            Filter extent S\n\n"
+	<< "  --alpha=<float>      Epilogue scalar alpha\n"
+	<< "  --beta=<float>       Epilogue scalar beta\n\n"
+	<< "  --split-k-slices=<int>   Split-k factor \n\n"
 	<< "  --ref-check          If set (true), reference check on the host is computed\n"
 	<< "  --perf-check         If set (true), performance is measured.\n"
 	<< "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
-	<< "  --iterations <int>   Number of profiling iterations to perform.\n"
+	<< "  --iterations=<int>   Number of profiling iterations to perform.\n"
 	<< "  --save-workspace     If set, workspace is written to a text file.\n"
-	<< "  --tag <string>       String to replicate across the first column in the results table\n";
+	<< "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
 	<< "$ ./examples/30_wgrad_split_k/30_wgrad_split_k --n=32 --h=224 --w=224 --c=128 --k=256 --r=3 --s=3 --split-k-slices=8\n\n";
@@ -499,7 +504,7 @@ Result profile_convolution(Options const &options) {
   result.status = implicit_gemm.can_implement(arguments);
   CUTLASS_CHECK(result.status);
 
-  // After the workspace is allocated, we point the Implicit GEMM destination pointer to the workspace.
+  // After the workspace is allocated, we point the GEMM destination pointer to the workspace.
   TensorNHWC layout_D{TensorNHWC::packed(options.filter_size)};
   arguments.ref_D.reset(reinterpret_cast<ElementCompute*>(workspace.get()), layout_D);
 
diff --git a/examples/30_wgrad_split_k/CMakeLists.txt b/examples/30_wgrad_split_k/CMakeLists.txt
index 8cc96ce9..3fc5a895 100644
--- a/examples/30_wgrad_split_k/CMakeLists.txt
+++ b/examples/30_wgrad_split_k/CMakeLists.txt
@@ -1,26 +1,34 @@
-# Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
+
 cutlass_example_add_executable(
   30_wgrad_split_k
   30_wgrad_split_k.cu
diff --git a/examples/31_basic_syrk/CMakeLists.txt b/examples/31_basic_syrk/CMakeLists.txt
new file mode 100644
index 00000000..e6c07750
--- /dev/null
+++ b/examples/31_basic_syrk/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  31_basic_syrk
+  basic_syrk.cu
+)
+  
diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu
new file mode 100644
index 00000000..e207cf73
--- /dev/null
+++ b/examples/31_basic_syrk/basic_syrk.cu
@@ -0,0 +1,522 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+  This example demonstrates how to call a CUTLASS SYRK kernel and provides a naive reference
+  matrix multiply kernel to verify its correctness.
+
+  The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes
+  the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
+  all matrices have column-major layout.
+
+  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
+  in CUTLASS.
+
+  https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/
+
+  Aside from defining and launching the SSYRK kernel, this example does not use any other components
+  or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
+  prevalent in the CUTLASS unit tests.
+
+*/
+
+// Standard Library includes
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+// Helper methods to check for errors
+#include "helper.h"
+
+//
+// CUTLASS includes needed for double-precision SYRK kernel
+//
+
+// Defines cutlass::gemm::device::Syrk, the generic Syrk computation template class.
+#include "cutlass/gemm/device/rank_k.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// This function defines a CUTLASS SYRK kernel instantiation, constructs its parameters object,
+// and launches it on the CUDA device.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Define a CUTLASS SYRK template and launch a SYRK kernel.
+cudaError_t CutlassSsyrkNN(
+  int N,
+  int K,
+  double alpha,
+  double const *A,
+  int lda,
+  double beta,
+  double *C,
+  int ldc) {
+
+  // Define type definition for double-precision CUTLASS SYRK with column-major
+  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  //
+  // To keep the interface manageable, several helpers are defined for plausible compositions
+  // including the following example for double-precision SYRK. Typical values are used as
+  // default template arguments.
+  //
+  // To view the full syrk device API interface, see `cutlass/gemm/device/syrk.h`
+
+  using ColumnMajor = cutlass::layout::ColumnMajor;
+
+  using CutlassSyrk = cutlass::gemm::device::RankK<
+    double,
+    ColumnMajor,
+    double,
+    ColumnMajor,
+    cutlass::FillMode::kLower,
+    double,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<16, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      double,
+      1,
+      double,
+      double
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    5,     // Stages
+    1,     // AligmentA
+    false, // SplitKSerail
+    cutlass::arch::OpMultiplyAdd, 
+    cutlass::ComplexTransform::kNone, 
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  // Define a CUTLASS SYRK type
+  CutlassSyrk syrk_operator;
+
+  // Construct the CUTLASS SYRK arguments object.
+  //
+  // One of CUTLASS's design patterns is to define syrk argument objects that are constructible
+  // in host code and passed to kernels by value. These may include pointers, strides, scalars,
+  // and other arguments needed by Syrk and its components.
+  //
+  // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible
+  // arguments to kernels and (2.) minimized initialization overhead on kernel entry.
+  //
+  CutlassSyrk::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm,
+                              {N, N, K}, // Syrk Problem dimensions
+                              1, // batch_count,
+                              {alpha, beta}, // Scalars used in the Epilogue
+                              reinterpret_cast<void const *>(A),
+                              const_cast<void *>(reinterpret_cast<void *>(C)),
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              (int64_t)N*K, // Batch strides
+                              (int64_t)N*N,
+                              (int64_t)N*N,
+                              lda,
+                              ldc,
+                              ldc);
+
+  //
+  // Launch the CUTLASS SYRK kernel.
+  //
+  
+  cutlass::Status status = syrk_operator(args);
+
+  //
+  // Return a cudaError_t if the CUTLASS SYRK operator returned an error code.
+  //
+
+  if (status != cutlass::Status::kSuccess) {
+    return cudaErrorUnknown;
+  }
+
+  // Return success, if no errors were encountered.
+  return cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// The source code after this point in the file is generic CUDA using the CUDA Runtime API
+// and simple CUDA kernels to initialize matrices and compute the general matrix product.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel to initialize a matrix with small integers.
+__global__ void InitializeMatrix_kernel(
+  double *matrix,
+  int ldm,
+  int rows,
+  int columns,
+  int seed = 0) {
+
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if (i < rows && j < columns) {
+    int offset = i + j * ldm;
+
+    // Generate arbitrary elements.
+    int const k = 16807;
+    int const m = 16;
+    double value = double(((offset + seed) * k % m) - m / 2);
+
+    matrix[offset] = value;
+  }
+}
+
+/// Simple function to initialize a matrix to arbitrary small integers.
+cudaError_t InitializeMatrix(double *matrix, int ldm, int rows, int columns, int seed = 0) {
+
+  dim3 block(16, 16);
+  dim3 grid(
+    (rows + block.x - 1) / block.x,
+    (columns + block.y - 1) / block.y
+  );
+
+  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed);
+
+  return cudaGetLastError();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Allocates device memory for a matrix then fills with arbitrary small integers.
+cudaError_t AllocateMatrix(double **matrix, int ldm, int rows, int columns, int seed = 0) {
+  cudaError_t result;
+
+  size_t sizeof_matrix = sizeof(double) * ldm * columns;
+
+  // Allocate device memory.
+  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to allocate matrix: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  // Clear the allocation.
+  result = cudaMemset(*matrix, 0, sizeof_matrix);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to clear matrix device memory: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  // Initialize matrix elements to arbitrary small integers.
+  result = InitializeMatrix(*matrix, ldm, rows, columns, seed);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to initialize matrix: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Naive reference SYRK computation.
+__global__ void ReferenceSyrk_kernel(
+  int N,
+  int K,
+  double alpha,
+  double const *A,
+  int lda,
+  double beta,
+  double *C,
+  int ldc) {
+
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if (i < N && j < N && i >= j ) { // Since C is in Lower Fill Mode
+    double accumulator = 0;
+
+    for (int k = 0; k < K; ++k) {
+      accumulator += A[i + k * lda] * A[j + k * lda];
+    }
+
+    C[i + j * ldc] = alpha * accumulator + beta * C[i + j * ldc];
+  }
+}
+
+/// Reference SYRK computation.
+cudaError_t ReferenceSyrk(
+  int N,
+  int K,
+  double alpha,
+  double const *A,
+  int lda,
+  double beta,
+  double *C,
+  int ldc) {
+
+  dim3 block(16, 16);
+  dim3 grid(
+    (N + block.x - 1) / block.x,
+    (N + block.y - 1) / block.y
+  );
+
+  ReferenceSyrk_kernel<<< grid, block >>>(N, K, alpha, A, lda, beta, C, ldc);
+
+  return cudaGetLastError();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Allocate several matrices in GPU device memory and call a double-precision
+/// CUTLASS SYRK kernel.
+cudaError_t TestCutlassSyrk(int N, int K, double alpha, double beta) {
+  cudaError_t result;
+
+  //
+  // Define several matrices to be used as operands to SYRK kernels.
+  //
+
+  // Compute leading dimensions for each matrix.
+  int lda = N;
+  int ldc = N;
+
+  // Compute size in bytes of the C matrix.
+  size_t sizeof_C = sizeof(double) * ldc * N;
+
+  // Define pointers to matrices in GPU device memory.
+  double *A;
+  double *C_cutlass;
+  double *C_reference;
+
+  //
+  // Allocate matrices in GPU device memory with arbitrary seeds.
+  //
+
+  result = AllocateMatrix(&A, lda, N, K, 0);
+
+  if (result !=  cudaSuccess) {
+    return result;
+  }
+
+  result = AllocateMatrix(&C_cutlass, ldc, N, N, 101);
+
+  if (result != cudaSuccess) {
+    cudaFree(A);
+    return result;
+  }
+
+  result = AllocateMatrix(&C_reference, ldc, N, N, 101);
+
+  if (result != cudaSuccess) {
+    cudaFree(A);
+    cudaFree(C_cutlass);
+    return result;
+  }
+
+  result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy C_cutlass matrix to C_reference: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Launch CUTLASS SYRK.
+  //
+
+  result = CutlassSsyrkNN(N, K, alpha, A, lda, beta, C_cutlass, ldc);
+
+  if (result != cudaSuccess) {
+    std::cerr << "CUTLASS SYRK kernel failed: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Verify.
+  //
+
+  // Launch reference SYRK
+  result = ReferenceSyrk(N, K, alpha, A, lda, beta, C_reference, ldc);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Reference SYRK kernel failed: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(A);
+
+    return result;
+  }
+
+  // Copy to host and verify equivalence.
+  std::vector<double> host_cutlass(ldc * N, 0);
+  std::vector<double> host_reference(ldc * N, 0);
+
+  result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy CUTLASS SYRK results: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(A);
+
+    return result;
+  }
+
+  result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy Reference SYRK results: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Free device memory allocations.
+  //
+
+  cudaFree(C_reference);
+  cudaFree(C_cutlass);
+  cudaFree(A);
+
+  //
+  // Test for bit equivalence of results.
+  //
+
+  if (host_cutlass != host_reference) {
+    std::cerr << "CUTLASS results incorrect." << std::endl;
+
+    return cudaErrorUnknown;
+  }
+
+  return cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Entry point to basic_syrk example.
+//
+// usage:
+//
+//   00_basic_syrk <N> <K> <alpha> <beta>
+//
+int main(int argc, const char *arg[]) {
+
+  bool notSupported = false;
+
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
+    std::cerr << "NVIDIA  Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+
+    return -1;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 80)) {
+
+    std::cerr << "This example requires compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    return 0;
+  }
+
+  //
+  // Parse the command line to obtain SYRK dimensions and scalar values.
+  //
+
+  // SYRK problem dimensions.
+  int problem[2] = { 128, 128 };
+
+  for (int i = 1; i < argc && i < 3; ++i) {
+    std::stringstream ss(arg[i]);
+    ss >> problem[i - 1];
+  }
+
+  // Scalars used for linear scaling the result of the matrix product.
+  double scalars[2] = { 1, 0 };
+
+  for (int i = 3; i < argc && i < 5; ++i) {
+    std::stringstream ss(arg[i]);
+    ss >> scalars[i - 3];
+  }
+
+  //
+  // Run the CUTLASS SYRK test.
+  //
+
+  cudaError_t result = TestCutlassSyrk(
+    problem[0],     // SYRK N dimension
+    problem[1],     // SYRK K dimension
+    scalars[0],     // alpha
+    scalars[1]      // beta
+  );
+
+  if (result == cudaSuccess) {
+    std::cout << "Passed." << std::endl;
+  }
+
+  // Exit.
+  return result == cudaSuccess ? 0 : -1;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/32_basic_trmm/CMakeLists.txt b/examples/32_basic_trmm/CMakeLists.txt
new file mode 100644
index 00000000..0e1afff1
--- /dev/null
+++ b/examples/32_basic_trmm/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  32_basic_trmm
+  basic_trmm.cu
+)
+  
diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu
new file mode 100644
index 00000000..54ac11ff
--- /dev/null
+++ b/examples/32_basic_trmm/basic_trmm.cu
@@ -0,0 +1,550 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*
+  This example demonstrates how to call a CUTLASS TRMM kernel and provides a naive reference
+  matrix multiply kernel to verify its correctness.
+
+  The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes
+  the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes
+  all matrices have column-major layout.
+
+  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
+  in CUTLASS.
+
+  https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/
+
+  Aside from defining and launching the STRMM kernel, this example does not use any other components
+  or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
+  prevalent in the CUTLASS unit tests.
+
+*/
+
+// Standard Library includes
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+// Helper methods to check for errors
+#include "helper.h"
+
+//
+// CUTLASS includes needed for double-precision TRMM kernel
+//
+
+// Defines cutlass::gemm::device::Trmm, the generic Trmm computation template class.
+#include "cutlass/gemm/device/trmm.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// This function defines a CUTLASS TRMM kernel instantiation, constructs its parameters object,
+// and launches it on the CUDA device.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Define a CUTLASS TRMM template and launch a TRMM kernel.
+cudaError_t CutlassStrmmNN(
+  int M,
+  int N,
+  double alpha,
+  double const *A,
+  int lda,
+  double const *B,
+  int ldb,
+  double *C,
+  int ldc) {
+
+  // Define type definition for double-precision CUTLASS TRMM with column-major
+  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  //
+  // To keep the interface manageable, several helpers are defined for plausible compositions
+  // including the following example for double-precision TRMM. Typical values are used as
+  // default template arguments.
+  //
+  // To view the full trmm device API interface, see `cutlass/gemm/device/trmm.h`
+
+  using ColumnMajor = cutlass::layout::ColumnMajor;
+
+  using CutlassTrmm = cutlass::gemm::device::Trmm<
+    double,
+    ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    ColumnMajor,
+    double,
+    ColumnMajor,
+    double,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64,64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      double,
+      1,
+      double,
+      double,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling 
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    5,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  // Define a CUTLASS TRMM type
+  CutlassTrmm trmm_operator;
+
+  // Construct the CUTLASS TRMM arguments object.
+  //
+  // One of CUTLASS's design patterns is to define trmm argument objects that are constructible
+  // in host code and passed to kernels by value. These may include pointers, strides, scalars,
+  // and other arguments needed by Trmm and its components.
+  //
+  // The benefits of this pattern are (1.) a structured, composable strategy for passing host-constructible
+  // arguments to kernels and (2.) minimized initialization overhead on kernel entry.
+  //
+  CutlassTrmm::Arguments args(cutlass::gemm::GemmUniversalMode::kGemm,
+                              {M, N, M}, // Trmm Problem dimensions in Left-Side Mode
+                              1, // batch_count,
+                              {alpha}, // Scalars used in the Epilogue
+                              reinterpret_cast<void const *>(A),
+                              reinterpret_cast<void const *>(B),
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              (int64_t)M*M, // Batch strides
+                              (int64_t)M*N,
+                              (int64_t)M*N,
+                              lda,
+                              ldb,
+                              ldc);
+
+  //
+  // Launch the CUTLASS TRMM kernel.
+  //
+  
+  cutlass::Status status = trmm_operator(args);
+
+  //
+  // Return a cudaError_t if the CUTLASS TRMM operator returned an error code.
+  //
+
+  if (status != cutlass::Status::kSuccess) {
+    return cudaErrorUnknown;
+  }
+
+  // Return success, if no errors were encountered.
+  return cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// The source code after this point in the file is generic CUDA using the CUDA Runtime API
+// and simple CUDA kernels to initialize matrices and compute the general matrix product.
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Kernel to initialize a matrix with small integers.
+__global__ void InitializeMatrix_kernel(
+  double *matrix,
+  int ldm,
+  int rows,
+  int columns,
+  int seed = 0,
+  cutlass::FillMode fill_mode = cutlass::FillMode::kInvalid) {
+
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if (i < rows && j < columns) {
+    if (fill_mode == cutlass::FillMode::kLower && i < j) return;
+    else if (fill_mode == cutlass::FillMode::kUpper && i > j) return;
+    int offset = i + j * ldm;
+
+    // Generate arbitrary elements.
+    int const k = 16807;
+    int const m = 16;
+    double value = double(((offset + seed) * k % m) - m / 2);
+
+    matrix[offset] = value;
+  }
+}
+
+/// Simple function to initialize a matrix to arbitrary small integers.
+cudaError_t InitializeMatrix(double *matrix, int ldm, int rows, int columns, int seed = 0,
+                             cutlass::FillMode fill_mode = cutlass::FillMode::kInvalid) {
+
+  dim3 block(16, 16);
+  dim3 grid(
+    (rows + block.x - 1) / block.x,
+    (columns + block.y - 1) / block.y
+  );
+
+  InitializeMatrix_kernel<<< grid, block >>>(matrix, ldm, rows, columns, seed, fill_mode);
+
+  return cudaGetLastError();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Allocates device memory for a matrix then fills with arbitrary small integers.
+cudaError_t AllocateMatrix(double **matrix, int ldm, int rows, int columns, int seed = 0,
+                           cutlass::FillMode fill_mode = cutlass::FillMode::kInvalid) {
+  cudaError_t result;
+
+  size_t sizeof_matrix = sizeof(double) * ldm * columns;
+
+  // Allocate device memory.
+  result = cudaMalloc(reinterpret_cast<void **>(matrix), sizeof_matrix);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to allocate matrix: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  // Clear the allocation.
+  result = cudaMemset(*matrix, 0, sizeof_matrix);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to clear matrix device memory: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  // Initialize matrix elements to arbitrary small integers.
+  result = InitializeMatrix(*matrix, ldm, rows, columns, seed, fill_mode);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to initialize matrix: "
+      << cudaGetErrorString(result) << std::endl;
+    return result;
+  }
+
+  return result;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Naive reference TRMM computation.
+__global__ void ReferenceTrmm_kernel(
+  int M,
+  int N,
+  double alpha,
+  double const *A,
+  int lda,
+  double const *B,
+  int ldb,
+  double *C,
+  int ldc) {
+
+  int i = threadIdx.x + blockIdx.x * blockDim.x;
+  int j = threadIdx.y + blockIdx.y * blockDim.y;
+
+  if (i < M && j < N) {
+    double accumulator = 0;
+
+    for (int k = 0; k < M; ++k) {
+      accumulator += A[i + k * lda] * B[k + j * ldb]; // Since A is in Left-Side Mode
+    }
+
+    C[i + j * ldc] = alpha * accumulator;
+  }
+}
+
+/// Reference TRMM computation.
+cudaError_t ReferenceTrmm(
+  int M,
+  int N,
+  double alpha,
+  double const *A,
+  int lda,
+  double const *B,
+  int ldb,
+  double *C,
+  int ldc) {
+
+  dim3 block(16, 16);
+  dim3 grid(
+    (M + block.x - 1) / block.x,
+    (N + block.y - 1) / block.y
+  );
+
+  ReferenceTrmm_kernel<<< grid, block >>>(M, N, alpha, A, lda, B, ldb, C, ldc);
+
+  return cudaGetLastError();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Allocate several matrices in GPU device memory and call a double-precision
+/// CUTLASS TRMM kernel.
+cudaError_t TestCutlassTrmm(int M, int N, double alpha) {
+  cudaError_t result;
+
+  //
+  // Define several matrices to be used as operands to TRMM kernels.
+  //
+
+  // Compute leading dimensions for each matrix.
+  int lda = M;
+  int ldb = M;
+  int ldc = M;
+
+  // Compute size in bytes of the C matrix.
+  size_t sizeof_C = sizeof(double) * ldc * N;
+
+  // Define pointers to matrices in GPU device memory.
+  double *A;
+  double *B;
+  double *C_cutlass;
+  double *C_reference;
+
+  //
+  // Allocate matrices in GPU device memory with arbitrary seeds.
+  //
+
+  result = AllocateMatrix(&A, lda, M, M, 0, cutlass::FillMode::kLower);
+
+  if (result !=  cudaSuccess) {
+    return result;
+  }
+
+  result = AllocateMatrix(&B, ldb, M, N, 17);
+
+  if (result !=  cudaSuccess) {
+    cudaFree(A);
+    return result;
+  }
+
+  result = AllocateMatrix(&C_cutlass, ldc, M, N, 101);
+
+  if (result != cudaSuccess) {
+    cudaFree(A);
+    cudaFree(B);
+    return result;
+  }
+
+  result = AllocateMatrix(&C_reference, ldc, M, N, 101);
+
+  if (result != cudaSuccess) {
+    cudaFree(A);
+    cudaFree(B);
+    cudaFree(C_cutlass);
+    return result;
+  }
+
+  result = cudaMemcpy(C_reference, C_cutlass, sizeof_C, cudaMemcpyDeviceToDevice);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy C_cutlass matrix to C_reference: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(B);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Launch CUTLASS TRMM.
+  //
+
+  result = CutlassStrmmNN(M, N, alpha, A, lda, B, ldb, C_cutlass, ldc);
+
+  if (result != cudaSuccess) {
+    std::cerr << "CUTLASS TRMM kernel failed: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(B);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Verify.
+  //
+
+  // Launch reference TRMM
+  result = ReferenceTrmm(M, N, alpha, A, lda, B, ldb, C_reference, ldc);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Reference TRMM kernel failed: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(B);
+    cudaFree(A);
+
+    return result;
+  }
+
+  // Copy to host and verify equivalence.
+  std::vector<double> host_cutlass(ldc * N, 0);
+  std::vector<double> host_reference(ldc * N, 0);
+
+  result = cudaMemcpy(host_cutlass.data(), C_cutlass, sizeof_C, cudaMemcpyDeviceToHost);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy CUTLASS TRMM results: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(B);
+    cudaFree(A);
+
+    return result;
+  }
+
+  result = cudaMemcpy(host_reference.data(), C_reference, sizeof_C, cudaMemcpyDeviceToHost);
+
+  if (result != cudaSuccess) {
+    std::cerr << "Failed to copy Reference TRMM results: "
+      << cudaGetErrorString(result) << std::endl;
+
+    cudaFree(C_reference);
+    cudaFree(C_cutlass);
+    cudaFree(B);
+    cudaFree(A);
+
+    return result;
+  }
+
+  //
+  // Free device memory allocations.
+  //
+
+  cudaFree(C_reference);
+  cudaFree(C_cutlass);
+  cudaFree(B);
+  cudaFree(A);
+
+  //
+  // Test for bit equivalence of results.
+  //
+
+  if (host_cutlass != host_reference) {
+    std::cerr << "CUTLASS results incorrect." << std::endl;
+
+    return cudaErrorUnknown;
+  }
+
+  return cudaSuccess;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Entry point to basic_trmm example.
+//
+// usage:
+//
+//   00_basic_trmm <M> <N> <alpha> 
+//
+int main(int argc, const char *arg[]) {
+
+  bool notSupported = false;
+
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
+    std::cerr << "NVIDIA  Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+
+    return -1;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 80)) {
+
+    std::cerr << "This example requires compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    return 0;
+  }
+
+  //
+  // Parse the command line to obtain TRMM dimensions and scalar values.
+  //
+
+  // TRMM problem dimensions.
+  int problem[2] = { 128, 128 };
+
+  for (int i = 1; i < argc && i < 3; ++i) {
+    std::stringstream ss(arg[i]);
+    ss >> problem[i - 1];
+  }
+
+  // Scalars used for linear scaling the result of the matrix product.
+  double scalars[1] = { 1 };
+
+  for (int i = 3; i < argc && i < 4; ++i) {
+    std::stringstream ss(arg[i]);
+    ss >> scalars[i - 3];
+  }
+
+  //
+  // Run the CUTLASS TRMM test.
+  //
+
+  cudaError_t result = TestCutlassTrmm(
+    problem[0],     // TRMM M dimension
+    problem[1],     // TRMM N dimension
+    scalars[0]     // alpha
+  );
+
+  if (result == cudaSuccess) {
+    std::cout << "Passed." << std::endl;
+  }
+
+  // Exit.
+  return result == cudaSuccess ? 0 : -1;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
new file mode 100644
index 00000000..858378da
--- /dev/null
+++ b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
@@ -0,0 +1,35 @@
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  33_ampere_3xtf32_tensorop_symm
+  ampere_3xtf32_tensorop_symm.cu
+  )
+
diff --git a/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
new file mode 100644
index 00000000..2e875a91
--- /dev/null
+++ b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
@@ -0,0 +1,687 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+NVIDIA Ampere architecture starts supporting tfloat32 (see include/cutlass/tfloat32.h)
+data types in tensor cores.  One big advantage is that we can load in F32 data and convert them
+implicitly to tf32 inside the SYMM kernel which means no change is needed to accelerate traditional
+F32 data by using NVIDIA Ampere architecture.
+
+We can use the tf32 mode of tensor core to emulate a fast accurate SYMM kernel which is accelerated
+using Ampere Tensor Cores (see include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h). 
+
+The trick is very simple
+  a x b = (a_big + a_small) x (b_big + b_small) = a_big x b_big + a_big x b_small + a_small x b_big
+  big = convert_to_tf32(F32)
+  small = convert_to_tf32(F32 - big)
+
+a_small x b_small is discarded because they are too small.
+
+This example demonstrates usage of this kernel, along with accuracy measurements w.r.t. actual F32 
+results (SSYMM from cuBLAS) and against F64 results (DSYMM from CUTLASS)
+
+To enable this feature, the only change needs to make is to change the default OpMultiplyAdd to 
+OpMultiplyAddFastF32. 
+
+Now, we have two different flavors of SSYMM in the profiler for Ampere:
+
+  s1688symm       // Use 3xTF32 to emulate F32.  F32 in, converted in TF32-big and TF32-small internally,
+                  // accumulated in F32, F32 out.
+  s1688tf32symm   // Use 1xTF32.  F32 in, converted to one TF32 internally, accumulated in F32, F32 out.
+*/
+
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_reduce.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "helper.h"
+
+#if CUTLASS_ENABLE_CUBLAS
+#include <cublas_v2.h>
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+
+  cutlass::gemm::GemmCoord problem_size;
+  float alpha;
+  float beta;
+  std::string rand_mode;
+  int seed;
+  
+  Options():
+    help(false),
+    problem_size({4096, 4096, 4096}),
+    seed(1),
+    alpha(1),
+    beta(),
+    rand_mode("uniform") { }
+
+  bool valid() {
+    //
+    // CUTLASS attempts to load 128b vectors of F32 elements. Consequently,
+    // all pointers, strides, and tensor extents must be divisible by 4 elements.
+    //
+    int const kAlignment = 4;
+
+    if ((problem_size.m() % kAlignment) ||
+      (problem_size.n() % kAlignment) ||
+      (problem_size.k() % kAlignment)) {
+
+      // misaligned tensors
+      return false;
+    }
+
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    // Since the kernels in this example are in Left Side Mode
+    cmd.get_cmd_line_argument("m", problem_size.k());
+
+    cmd.get_cmd_line_argument("alpha", alpha);
+    cmd.get_cmd_line_argument("beta", beta);
+    
+    cmd.get_cmd_line_argument("seed", seed);
+    cmd.get_cmd_line_argument("rand_mode", rand_mode);
+
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "33_ampere_3xtf32_tensorop_symm example\n\n"
+      << "  This example uses the CUTLASS Library to execute 3xTF32 tensorop SYMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   SYMM M dimension\n"
+      << "  --n=<int>                   SYMM N dimension\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --rand_mode=<string>        gauss / uniform*\n\n"
+      << "  --seed=<int>                Random number seed (1*)\n\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/33_ampere_3xtf32_tensorop_symm/33_ampere_3xtf32_tensorop_symm --m=1024 --n=512 \\\n"
+      << "     --alpha=2 --beta=1 \n\n";
+
+    return out;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// The code section below describes matrix layout of input and output matrices. Column Major for
+// Matrix A, Matrix B and Matrix C (since that's what cuBLAS supports, CUTLASS supports Row Major too)
+using LayoutInputA = cutlass::layout::ColumnMajor;
+using LayoutInputB = cutlass::layout::ColumnMajor;
+using LayoutOutput = cutlass::layout::ColumnMajor;
+
+// Symmetric Matrix A is in Left Side mode
+constexpr cutlass::SideMode SideModeA = cutlass::SideMode::kLeft;
+// Symmetric Matrix A is in Lower Filled mode
+constexpr cutlass::FillMode FillModeA = cutlass::FillMode::kLower;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm80;
+
+// This code section describes the tile size a thread block will compute
+using ShapeMMAThreadBlock =
+    cutlass::gemm::GemmShape<128, 64, 16>;  // <- threadblock tile M = 128, N = 128, K = 16
+// This code section describes tile size a warp will compute
+using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 32, 16>;  // <- warp tile M = 64, N = 64, K = 16
+// This code section describes the size of MMA op
+using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>;  // <- MMA Op tile M = 16, N = 8, K = 8
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;  // <- ??
+
+// This code section describes the epilogue part of the kernel
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+    float,                                     // <- data type of output matrix
+    128 / cutlass::sizeof_bits<float>::value,  // <- the number of elements per vectorized
+                                                       // memory access. For a byte, it's 16
+                                                       // elements. This becomes the vector width of
+                                                       // math instructions in the epilogue too
+    float,                                   // <- data type of accumulator
+    float>;                                  // <- data type for alpha/beta in linear combination function
+
+// Number of pipelines you want to use
+constexpr int NumStages = 3;
+// Alignment 
+constexpr int Alignment = 4;
+
+// 
+// CUTLASS Symm Operators (SSYM: Symm_3xTF32, Symm_1xTF32, DSYMM: Symm_F64)
+//
+
+// Symm_3xTF32
+using Symm_3xTF32 = cutlass::gemm::device::Symm<
+                                              float,
+                                              LayoutInputA,
+                                              SideModeA,
+                                              FillModeA,
+                                              float,
+                                              LayoutInputB,
+                                              float,
+                                              LayoutOutput,
+                                              float,
+                                              MMAOp,
+                                              SmArch,
+                                              ShapeMMAThreadBlock,
+                                              ShapeMMAWarp,
+                                              ShapeMMAOp,
+                                              EpilogueOp,
+                                              SwizzleThreadBlock,
+                                              NumStages,
+                                              1, // Symmetric matrix is always align 1 
+                                              Alignment,
+                                              false,
+                                              cutlass::arch::OpMultiplyAddFastF32>;
+
+// Symm_1xTF32
+using Symm_1xTF32 = cutlass::gemm::device::Symm<
+                                              float,
+                                              LayoutInputA,
+                                              SideModeA,
+                                              FillModeA,
+                                              float,
+                                              LayoutInputB,
+                                              float,
+                                              LayoutOutput,
+                                              float,
+                                              MMAOp,
+                                              SmArch,
+                                              ShapeMMAThreadBlock,
+                                              ShapeMMAWarp,
+                                              ShapeMMAOp,
+                                              EpilogueOp,
+                                              SwizzleThreadBlock,
+                                              NumStages,
+                                              1, // Symmetric matrix is always align 1 
+                                              Alignment,
+                                              false,
+                                              cutlass::arch::OpMultiplyAdd>;
+
+// Symm_F64
+using Symm_F64 = cutlass::gemm::device::Symm<
+                                              double,
+                                              LayoutInputA,
+                                              SideModeA,
+                                              FillModeA,
+                                              double,
+                                              LayoutInputB,
+                                              double,
+                                              LayoutOutput,
+                                              double,
+                                              cutlass::arch::OpClassTensorOp,
+                                              cutlass::arch::Sm80,
+                                              cutlass::gemm::GemmShape<32, 32, 16>,
+                                              cutlass::gemm::GemmShape<16, 16, 16>,
+                                              cutlass::gemm::GemmShape<8, 8, 4>,
+                                              cutlass::epilogue::thread::LinearCombination<
+                                                double,
+                                                1,
+                                                double,
+                                                double
+                                              >,
+                                              cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+                                              4>;
+
+bool run(Options &options) {
+
+  // Create a tuple of problem size for matrix multiplication
+  cutlass::gemm::GemmCoord problem_size = options.problem_size;
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 1. Initialize F32 Precision input tensors using CUTLASS helper functions
+  ////////////////////////////////////////////////////////////////////////////////
+  cutlass::HostTensor<float, LayoutInputA> tensor_a_F32(problem_size.mk());  // <- Create matrix A with dimensions M x K
+  cutlass::HostTensor<float, LayoutInputB> tensor_b_F32(problem_size.kn());  // <- Create matrix B with dimensions K x N
+  cutlass::HostTensor<float, LayoutOutput> tensor_c_F32(problem_size.mn());  // <- Create matrix C with dimensions M x N
+  cutlass::HostTensor<float, LayoutOutput> tensor_d_F32(problem_size.mn());  // <- Create matrix D with dimensions M x N 
+
+  if (options.rand_mode == "uniform") {
+    const float min = -1;
+    const float max =  1;
+    // Fill input and output matrices on host using CUTLASS helper functions
+    cutlass::reference::host::TensorFillRandomUniform(
+        tensor_a_F32.host_view(),
+        options.seed,
+        double(max),
+        double(min));      // <- Fill matrix A on host with uniform-distribution random data
+    cutlass::reference::host::TensorFillRandomUniform(
+        tensor_b_F32.host_view(),
+        options.seed,
+        double(max),
+        double(min));      // <- Fill matrix B on host with uniform-distribution random data
+    cutlass::reference::host::TensorFillRandomUniform(
+        tensor_c_F32.host_view(),
+        options.seed,
+        double(max),
+        double(min));      // <- Fill matrix C on host with uniform-distribution random data
+  } else if (options.rand_mode == "gauss") {
+    // Fill input and output matrices on host using CUTLASS helper functions
+    cutlass::reference::host::TensorFillRandomGaussian(
+        tensor_a_F32.host_view(),
+        options.seed,
+        double(0),
+        double(5));      // <- Fill matrix A on host with gaussian-distribution random data
+    cutlass::reference::host::TensorFillRandomGaussian(
+        tensor_b_F32.host_view(),
+        options.seed,
+        double(0),
+        double(5));      // <- Fill matrix B on host with gaussian-distribution random data
+    cutlass::reference::host::TensorFillRandomGaussian(
+        tensor_c_F32.host_view(),
+        options.seed,
+        double(0),
+        double(5));      // <- Fill matrix C on host with gaussian-distribution random data
+  }
+  cutlass::reference::host::TensorFill(
+      tensor_d_F32.host_view());  // <- fill matrix D on host with zeros
+  
+  // Copy data from host to GPU
+  tensor_a_F32.sync_device();
+  tensor_b_F32.sync_device();
+  tensor_c_F32.sync_device();
+  tensor_d_F32.sync_device();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 2. Initialize F64 tensors, Output tensors and setup arguments
+  ////////////////////////////////////////////////////////////////////////////////
+  // Symm F64 input operands (A, B, C)
+  cutlass::HostTensor<double, LayoutInputA> tensor_a_F64(problem_size.mk());  // <- Create matrix A with dimensions M x K
+  cutlass::HostTensor<double, LayoutInputB> tensor_b_F64(problem_size.kn());  // <- Create matrix B with dimensions K x N
+  cutlass::HostTensor<double, LayoutOutput> tensor_c_F64(problem_size.mn());  // <- Create matrix C with dimensions M x N
+  
+  // Symm output (D) for SYMM_3xTF32
+  cutlass::HostTensor<float, LayoutOutput> tensor_d_3xTF32(problem_size.mn());  // <- Create matrix D with dimensions M x N
+  // Symm output (D) for SYMM_1xTF32
+  cutlass::HostTensor<float, LayoutOutput> tensor_d_1xTF32(problem_size.mn());  // <- Create matrix D with dimensions M x N
+  // Symm output (D) for SYMM_F64
+  cutlass::HostTensor<double, LayoutOutput> tensor_d_F64(problem_size.mn());  // <- Create matrix D with dimensions M x N
+#if CUTLASS_ENABLE_CUBLAS
+  // Symm output (D) for SYMM_cublasF32
+  cutlass::HostTensor<float, LayoutOutput> tensor_d_cublasF32(problem_size.mn());  // <- Create matrix D with dimensions M x N
+#endif
+
+  // Copy values from the DP tensors
+  cutlass::reference::host::TensorCopy(tensor_a_F64.host_view(), tensor_a_F32.host_view());
+  cutlass::reference::host::TensorCopy(tensor_b_F64.host_view(), tensor_b_F32.host_view());
+  cutlass::reference::host::TensorCopy(tensor_c_F64.host_view(), tensor_c_F32.host_view());
+  cutlass::reference::host::TensorCopy(tensor_d_F64.host_view(), tensor_d_F32.host_view());
+  cutlass::reference::host::TensorCopy(tensor_d_3xTF32.host_view(), tensor_d_F32.host_view());
+  cutlass::reference::host::TensorCopy(tensor_d_1xTF32.host_view(), tensor_d_F32.host_view());
+#if CUTLASS_ENABLE_CUBLAS
+  cutlass::reference::host::TensorCopy(tensor_d_cublasF32.host_view(), tensor_d_F32.host_view());
+#endif
+  
+  // Copy data from host to GPU
+  tensor_a_F64.sync_device();
+  tensor_b_F64.sync_device();
+  tensor_c_F64.sync_device();
+  tensor_d_F64.sync_device();
+  tensor_d_3xTF32.sync_device();
+  tensor_d_1xTF32.sync_device();
+#if CUTLASS_ENABLE_CUBLAS
+  tensor_d_cublasF32.sync_device();
+#endif
+
+  // Initialize alpha and beta for dot product computation
+  float alpha = float(options.alpha);
+  float beta =  float(options.beta);
+
+  // Batch count as 1
+  int batch_count = 1;
+
+  // Batch stride for A, when matrix A is in Left Side mode
+  int batch_stride_A = problem_size.m()*problem_size.m();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 3. Run 3xTF32 kernel
+  ////////////////////////////////////////////////////////////////////////////////
+  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+  // instantiated CUTLASS kernel
+  typename Symm_3xTF32::Arguments arguments_3xtf32{
+                                     cutlass::gemm::GemmUniversalMode::kGemm,
+                                     problem_size,                  // <- problem size of matrix multiplication
+                                     batch_count,                   // <- batch count
+                                     {alpha, beta},                 // <- tuple of alpha and beta
+                                     tensor_a_F32.device_data(),    // <- reference to matrix A on device
+                                     tensor_b_F32.device_data(),    // <- reference to matrix B on device
+                                     tensor_c_F32.device_data(),    // <- reference to matrix C on device
+                                     tensor_d_3xTF32.device_data(), // <- reference to matrix D on device
+                                     batch_stride_A,                // <- batch stride and ld for matrices
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     tensor_a_F32.layout().stride(0),
+                                     tensor_b_F32.layout().stride(0),
+                                     tensor_c_F32.layout().stride(0),
+                                     tensor_d_3xTF32.layout().stride(0)
+                                     };
+
+  // Using the arguments, query for extra workspace required for matrix multiplication computation
+  size_t workspace_size_3xtf32 = Symm_3xTF32::get_workspace_size(arguments_3xtf32);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace_3xtf32(workspace_size_3xtf32);
+
+  // Instantiate CUTLASS kernel depending on templates
+  Symm_3xTF32 symm_op_3xtf32;
+
+  // Check the problem size is supported or not 
+  cutlass::Status status_3xtf32 = symm_op_3xtf32.can_implement(arguments_3xtf32);
+  CUTLASS_CHECK(status_3xtf32);
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  status_3xtf32 = symm_op_3xtf32.initialize(arguments_3xtf32, workspace_3xtf32.get());
+  CUTLASS_CHECK(status_3xtf32);
+
+  // Launch initialized CUTLASS kernel
+  status_3xtf32 = symm_op_3xtf32();
+  CUTLASS_CHECK(status_3xtf32);
+
+  tensor_d_3xTF32.sync_host();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 4. Run 1xTF32 kernel
+  ////////////////////////////////////////////////////////////////////////////////
+  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+  // instantiated CUTLASS kernel
+  typename Symm_1xTF32::Arguments arguments_1xtf32{
+                                     cutlass::gemm::GemmUniversalMode::kGemm,
+                                     problem_size,                  // <- problem size of matrix multiplication
+                                     batch_count,                   // <- batch count
+                                     {alpha, beta},                 // <- tuple of alpha and beta
+                                     tensor_a_F32.device_data(),    // <- reference to matrix A on device
+                                     tensor_b_F32.device_data(),    // <- reference to matrix B on device
+                                     tensor_c_F32.device_data(),    // <- reference to matrix C on device
+                                     tensor_d_1xTF32.device_data(), // <- reference to matrix D on device
+                                     batch_stride_A,                // <- batch stride and ld for matrices
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     tensor_a_F32.layout().stride(0),
+                                     tensor_b_F32.layout().stride(0),
+                                     tensor_c_F32.layout().stride(0),
+                                     tensor_d_1xTF32.layout().stride(0)
+                                     };
+
+  // Using the arguments, query for extra workspace required for matrix multiplication computation
+  size_t workspace_size_1xtf32 = Symm_1xTF32::get_workspace_size(arguments_1xtf32);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace_1xtf32(workspace_size_1xtf32);
+
+  // Instantiate CUTLASS kernel depending on templates
+  Symm_1xTF32 symm_op_1xtf32;
+
+  // Check the problem size is supported or not 
+  cutlass::Status status_1xtf32 = symm_op_1xtf32.can_implement(arguments_1xtf32);
+  CUTLASS_CHECK(status_1xtf32);
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  status_1xtf32 = symm_op_1xtf32.initialize(arguments_1xtf32, workspace_1xtf32.get());
+  CUTLASS_CHECK(status_1xtf32);
+
+  // Launch initialized CUTLASS kernel
+  status_1xtf32 = symm_op_1xtf32();
+  CUTLASS_CHECK(status_1xtf32);
+
+  tensor_d_1xTF32.sync_host();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 5. Run F64 kernel
+  ////////////////////////////////////////////////////////////////////////////////
+  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+  // instantiated CUTLASS kernel
+  typename Symm_F64::Arguments arguments_f64{
+                                     cutlass::gemm::GemmUniversalMode::kGemm,
+                                     problem_size,                  // <- problem size of matrix multiplication
+                                     batch_count,                   // <- batch count
+                                     {double(options.alpha), double(options.alpha)},                 // <- tuple of alpha and beta
+                                     tensor_a_F64.device_data(),    // <- reference to matrix A on device
+                                     tensor_b_F64.device_data(),    // <- reference to matrix B on device
+                                     tensor_c_F64.device_data(),    // <- reference to matrix C on device
+                                     tensor_d_F64.device_data(),    // <- reference to matrix D on device
+                                     batch_stride_A,                // <- batch stride and ld for matrices
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     problem_size.m() * problem_size.n(),
+                                     tensor_a_F64.layout().stride(0),
+                                     tensor_b_F64.layout().stride(0),
+                                     tensor_c_F64.layout().stride(0),
+                                     tensor_d_F64.layout().stride(0)
+                                     };
+
+  // Using the arguments, query for extra workspace required for matrix multiplication computation
+  size_t workspace_size_f64 = Symm_F64::get_workspace_size(arguments_f64);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace_f64(workspace_size_f64);
+
+  // Instantiate CUTLASS kernel depending on templates
+  Symm_F64 symm_op_f64;
+
+  // Check the problem size is supported or not 
+  cutlass::Status status_f64 = symm_op_f64.can_implement(arguments_f64);
+  CUTLASS_CHECK(status_f64);
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  status_f64 = symm_op_f64.initialize(arguments_f64, workspace_f64.get());
+  CUTLASS_CHECK(status_f64);
+
+  // Launch initialized CUTLASS kernel
+  status_f64 = symm_op_f64();
+  CUTLASS_CHECK(status_f64);
+
+  cudaDeviceSynchronize();
+
+  tensor_d_F64.sync_host();
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 6. Run cuBLAS SSYMM kernel
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if CUTLASS_ENABLE_CUBLAS
+  cublasStatus_t cublas_status;
+  cublasHandle_t handle;
+
+  cublas_status = cublasCreate(&handle);
+  if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+  std::cerr << "Failed to create cuBLAS handle." << std::endl;
+    return false;
+  }
+
+  cublas_status = cublasSsymm(
+      handle,
+      CUBLAS_SIDE_LEFT,
+      CUBLAS_FILL_MODE_LOWER,
+      problem_size.m(),
+      problem_size.n(),
+      static_cast<const float*>(&alpha),
+      static_cast<const float*>(tensor_a_F32.device_data()),
+      int(tensor_a_F32.layout().stride(0)),
+      static_cast<const float*>(tensor_b_F32.device_data()),
+      int(tensor_b_F32.layout().stride(0)),
+      static_cast<const float*>(&beta),
+      static_cast<float*>(tensor_d_cublasF32.device_data()),
+      int(tensor_d_cublasF32.layout().stride(0))
+    );   
+
+  cudaDeviceSynchronize();
+
+  tensor_d_cublasF32.sync_host();
+#endif
+
+  ////////////////////////////////////////////////////////////////////////////////
+  /// 7. Compute l2 norms 
+  ////////////////////////////////////////////////////////////////////////////////
+
+#if CUTLASS_ENABLE_CUBLAS
+  // l2 norm cuBLAS F32 vs F64
+  cutlass::HostTensor<double, LayoutOutput> tensor_d_cublasF32_in_F64(problem_size.mn());
+  cutlass::reference::host::TensorCopy(tensor_d_cublasF32_in_F64.host_view(), tensor_d_cublasF32.host_view());
+
+  double l2_norm_cublasf32_vs_f64 = cutlass::reference::host::TensorRelativeErrorMetric(
+    tensor_d_cublasF32_in_F64.host_view(), tensor_d_F64.host_view());
+#endif
+
+  // l2 norm 3xTF32 vs F64
+  cutlass::HostTensor<double, LayoutOutput> tensor_d_3xTF32_in_F64(problem_size.mn());
+  cutlass::reference::host::TensorCopy(tensor_d_3xTF32_in_F64.host_view(), tensor_d_3xTF32.host_view());
+  double l2_norm_3xtf32_vs_f64 = cutlass::reference::host::TensorRelativeErrorMetric(
+    tensor_d_3xTF32_in_F64.host_view(), tensor_d_F64.host_view());
+
+  // l2 norm 1xTF32 vs F64
+  cutlass::HostTensor<double, LayoutOutput> tensor_d_1xTF32_in_F64(problem_size.mn());
+  cutlass::reference::host::TensorCopy(tensor_d_1xTF32_in_F64.host_view(), tensor_d_1xTF32.host_view());
+  double l2_norm_1xtf32_vs_f64 = cutlass::reference::host::TensorRelativeErrorMetric(
+    tensor_d_1xTF32_in_F64.host_view(), tensor_d_F64.host_view());
+
+#if CUTLASS_ENABLE_CUBLAS
+  // l2 norm 3xTF32 vs cuBLAS F32
+  double l2_norm_3xtf32_vs_cublasf32 = cutlass::reference::host::TensorRelativeErrorMetric(
+    tensor_d_3xTF32.host_view(), tensor_d_cublasF32.host_view());
+#endif
+  
+  // l2 norm 3xTF32 vs 1xTF32
+  double l2_norm_3xtf32_vs_1xtf32 = cutlass::reference::host::TensorRelativeErrorMetric(
+    tensor_d_3xTF32.host_view(), tensor_d_1xTF32.host_view());
+
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Print kernel info and L2 norms 
+  std::cout << "Problem Size: (" << problem_size.m() << "," << problem_size.n() << "," << problem_size.k() << ") "
+            << "Alpha: "  << alpha << "," << " Beta: "  << beta << std::endl;
+  std::cout << std::fixed;
+  std::cout << "Normalized L2 norm of" << std::endl;
+  std::cout.precision(8);
+  std::cout << std::scientific 
+#if CUTLASS_ENABLE_CUBLAS
+            << " - cuBLAS F32 error with F64 reference    : " << l2_norm_cublasf32_vs_f64 << std::endl
+#endif
+            << " - 3xTF32 error with F64 reference        : " << l2_norm_3xtf32_vs_f64 << std::endl
+            << " - 1xTF32 error with F64 reference        : " << l2_norm_1xtf32_vs_f64 << std::endl
+#if CUTLASS_ENABLE_CUBLAS
+            << " - 3xTF32 error with cuBLAS F32 reference : " << l2_norm_3xtf32_vs_cublasf32 << std::endl
+#endif
+            << " - 3xTF32 error with 1xTF32 reference     : " << l2_norm_3xtf32_vs_1xtf32 << std::endl;
+
+  return true;
+}
+
+int main(int argc, const char **argv) {
+  
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
+  // in CUDA 11.0. 
+  //
+  // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
+    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return false;
+  }
+
+  if (!((props.major * 10 + props.minor) >= 80)) {
+    std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+  Options options;
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  bool result = true;
+
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  result = run(options);
+
+  if (!result) return -1;
+
+  return 0;
+}
diff --git a/examples/34_transposed_conv2d/34_transposed_conv2d.cu b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
index 2fdace06..d9d878ad 100644
--- a/examples/34_transposed_conv2d/34_transposed_conv2d.cu
+++ b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -27,10 +33,8 @@
 This example shows how to compute 2d transposed convolution, also known as deconvolution, using CUTLASS
 conv2d Dgrad kernels. Although two operations are computationaly equivalent, some care is needed to correctly
 set up a problem size for CUTLASS.
-
 In deep learning, transposed convolution is sometimes used for upscaling feature maps. This example
 demonstrates the 2x upscaling case using the strided Dgrad kernel.
-
 */
 
 #include <iostream>
@@ -232,22 +236,22 @@ struct Options {
 	<< "  computationaly equivalent, some care is needed to correctly set up a problem size.\n\n"
 	<< "Options:\n\n"
 	<< "  --help               If specified, displays this usage statement.\n\n"
-	<< "  --n <int>            Input tensor extent N\n"
-	<< "  --h <int>            Input tensor extent H\n"
-	<< "  --w <int>            Input tensor extent W\n"
-	<< "  --c <int>            Input tensor extent C\n"
-	<< "  --k <int>            Filter extent K\n"
-	<< "  --r <int>            Filter extent R\n"
-	<< "  --s <int>            Filter extent S\n\n"
-	<< "  --alpha <float>      Epilogue scalar alpha\n"
-	<< "  --beta <float>       Epilogue scalar beta\n\n"
+	<< "  --n=<int>            Input tensor extent N\n"
+	<< "  --h=<int>            Input tensor extent H\n"
+	<< "  --w=<int>            Input tensor extent W\n"
+	<< "  --c=<int>            Input tensor extent C\n"
+	<< "  --k=<int>            Filter extent K\n"
+	<< "  --r=<int>            Filter extent R\n"
+	<< "  --s=<int>            Filter extent S\n\n"
+	<< "  --alpha=<float>      Epilogue scalar alpha\n"
+	<< "  --beta=<float>       Epilogue scalar beta\n\n"
 	<< "  --skip-ref-check     If set (true), skip reference check on the host\n"
 	<< "  --perf-check         If set (true), performance is measured.\n"
-	<< "  --iterations <int>   Number of profiling iterations to perform.\n"
-	<< "  --tag <string>       String to replicate across the first column in the results table\n";
+	<< "  --iterations=<int>   Number of profiling iterations to perform.\n"
+	<< "  --tag=<string>       String to replicate across the first column in the results table\n";
 
     out << "\n\nExamples:\n\n"
-	<< "$ ./examples/34_transposed_conv2d/34_transposed_conv2d --n=8 --h=32 --w=32 --c=16 --k=32 --r=3 --s=3\n\n";
+	<< "$ ./examples/31_transposed_conv2d/31_transposed_conv2d --n=8 --h=32 --w=32 --c=16 --k=32 --r=3 --s=3\n\n";
 
     return out;
   }
@@ -327,7 +331,6 @@ struct Result {
   }
 };
 
-
 // This is the same as Conv2dDgrad in tools/util/include/cutlass/util/reference/host/convolution.h,
 // only variable names have been adapted for transposed conv2d.
 void Conv2dTransposeReference(
diff --git a/examples/34_transposed_conv2d/CMakeLists.txt b/examples/34_transposed_conv2d/CMakeLists.txt
index d14c9e24..00a0dbce 100644
--- a/examples/34_transposed_conv2d/CMakeLists.txt
+++ b/examples/34_transposed_conv2d/CMakeLists.txt
@@ -1,23 +1,30 @@
-# Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
diff --git a/examples/35_gemm_softmax/CMakeLists.txt b/examples/35_gemm_softmax/CMakeLists.txt
new file mode 100644
index 00000000..51611290
--- /dev/null
+++ b/examples/35_gemm_softmax/CMakeLists.txt
@@ -0,0 +1,36 @@
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+cutlass_example_add_executable(
+  35_gemm_softmax
+  gemm_softmax.cu
+  )
+
diff --git a/examples/35_gemm_softmax/epilogue_with_visitor.h b/examples/35_gemm_softmax/epilogue_with_visitor.h
new file mode 100644
index 00000000..aa322553
--- /dev/null
+++ b/examples/35_gemm_softmax/epilogue_with_visitor.h
@@ -0,0 +1,407 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Generic epilogue for implementing certain kinds of fused epilogue behavior.
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/epilogue/threadblock/epilogue_base.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+class EpilogueFusedVisitorConcept {
+public:
+
+  static int const kIterations = 1;
+  static int const kElementsPerAccess = 4;
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
+
+  /// Arguments structure
+  struct Arguments {  };
+
+  /// Params structure
+  struct Params {
+
+    Params() { }
+    Params(Arguments const &args) { }
+  };
+
+  /// Shared storage
+  struct SharedStorage { };
+
+public:
+
+  CUTLASS_DEVICE
+  EpilogueFusedVisitorConcept(
+    Params const &params,                                         ///< Parameters routed to the epilogue
+    SharedStorage &shared_storage,                                ///< Shared storage needed by the functors here
+    MatrixCoord const &problem_size,                              ///< Problem size of the output
+    int thread_idx,                                               ///< Thread index within the threadblock
+    int warp_idx,                                                 ///< Warp index within the threadblock
+    int lane_idx,                                                 ///< Lane index within the warp
+    MatrixCoord const &threadblock_offset = MatrixCoord(0, 0)) {  ///< Coordinate
+
+  }
+
+  /// Helper to indicate split-K behavior
+  CUTLASS_DEVICE
+  void set_k_partition(
+    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
+    int split_k_slices) {                                         ///< Total number of split-K slices
+
+  }
+
+  /// Called to set the batch index
+  CUTLASS_DEVICE
+  void set_batch_index(int batch_idx) {
+
+  }
+
+  /// Called at the start of the epilogue just before iterating over accumulator slices
+  CUTLASS_DEVICE
+  void begin_epilogue() {
+
+  }
+
+  /// Called at the start of one step before starting accumulator exchange
+  CUTLASS_DEVICE
+  void begin_step(int step_idx) {
+
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void begin_row(int row_idx) {
+
+  }
+
+  /// Called after accumulators have been exchanged for each accumulator vector
+  CUTLASS_DEVICE
+  void visit(
+    int row_idx,
+    int column_idx,
+    int frag_idx,
+    AccumulatorFragment const &accum) {
+
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void end_row(int row_idx) {
+
+  }
+
+  /// Called after all accumulator elements have been visited
+  CUTLASS_DEVICE
+  void end_step(int step_idx) {
+
+  }
+
+  /// Called after all steps have been completed
+  CUTLASS_DEVICE
+  void end_epilogue() {
+
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Epilogue operator
+template <
+  typename Visitor_,                        ///< Functor containing fused operations (satisfies EpilogueFusedVisitorConcept)
+  typename Shape_,                          ///< Shape of threadblock tile (concept: GemmShape)
+  typename WarpMmaOperator_,                ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp)
+  int PartitionsK,                          ///< Number of partitions of the K dimension
+  typename AccumulatorFragmentIterator_,    ///< Fragment iterator selecting accumulators
+  typename WarpTileIterator_,               ///< Warp-scoped tile iterator writing accumulators to SMEM
+  typename SharedLoadIterator_,             ///< Threadblock-scoped tile iterator loading from SMEM
+  typename Padding_,                        ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape)
+  int FragmentsPerPartition = 1,            ///< Used to coarsten the epilogue granularity
+  int IterationsUnroll =                    ///< Used to reduce binary size when epilogue op is large
+    (true || !IsEpilogueFunctorHeavy<Visitor_>::value)
+>
+class EpilogueWithVisitor :
+  public EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition> {
+
+public:
+
+  using Visitor = Visitor_;
+
+  using Base = EpilogueBase<
+    Shape_,
+    typename WarpMmaOperator_::Shape,
+    PartitionsK,
+    AccumulatorFragmentIterator_,
+    WarpTileIterator_,
+    Padding_,
+    FragmentsPerPartition>;
+
+  using Shape = Shape_;
+  using WarpMmaOperator = WarpMmaOperator_;
+  static int const kPartitionsK = PartitionsK;
+
+  using AccumulatorFragmentIterator = AccumulatorFragmentIterator_;
+  using WarpTileIterator = WarpTileIterator_;
+  using SharedLoadIterator = SharedLoadIterator_;
+  using Padding = Padding_;
+
+  using Layout = layout::RowMajor;
+  using LongIndex = typename Layout::LongIndex;
+
+  /// The complete warp-level accumulator tile
+  using AccumulatorTile = typename Base::AccumulatorTile;
+
+  /// Accumulator element
+  using ElementAccumulator = typename WarpTileIterator::Element;
+
+  /// Output access size
+  static int const kElementsPerAccess = Visitor::kElementsPerAccess;
+
+  /// Tensor reference to sync tensor
+  using SyncTensorRef = typename cutlass::TensorRef<int, cutlass::layout::PackedVectorLayout>;
+
+  /// Array type used by output functor
+  using AccumulatorAccessType = Array<
+    typename WarpTileIterator::Element, kElementsPerAccess>;
+
+  /// Number of warps
+  using WarpCount = typename Base::WarpCount;
+
+  static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK;
+  static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles;
+
+  using SharedStorage = typename Base::SharedStorage;
+
+private:
+
+  /// Loads fragment from shared memory aligned with output tensor
+  SharedLoadIterator shared_load_iterator_;
+
+public:
+
+  /// Constructor
+  CUTLASS_DEVICE
+  EpilogueWithVisitor(
+    SharedStorage &shared_storage,    ///< Shared storage object
+    int thread_idx,                   ///< ID of a thread within the threadblock
+    int warp_idx,                     ///< ID of warp within threadblock
+    int lane_idx                      ///< Id of thread within warp
+  ):
+    Base(shared_storage, thread_idx, warp_idx, lane_idx),
+    shared_load_iterator_(shared_storage.reference(), thread_idx)
+  {
+
+  }
+
+  /// Streams the result to global memory
+  CUTLASS_DEVICE
+  void operator()(
+    Visitor & visitor,
+    AccumulatorTile const &accumulators) {         ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
+
+    visitor.begin_epilogue();
+
+    //
+    // Iterator over warp-level accumulator fragment
+    //
+
+    AccumulatorFragmentIterator accum_fragment_iterator(accumulators);
+
+    //
+    // Iterate over accumulator tile
+    //
+
+    #pragma unroll(IterationsUnroll ? Visitor::kIterations : 1)
+    for (int iter_idx = 0; iter_idx < Visitor::kIterations; ++iter_idx) {
+
+      //
+      // Load the source
+      //
+
+      visitor.begin_step(iter_idx);
+
+      //
+      // Convert and store fragment
+      //
+
+      __syncthreads();
+
+      acc2smem_source_needed<cutlass::make_index_sequence<Visitor::kIterations>>::push(
+          iter_idx, accum_fragment_iterator, this->warp_tile_iterator_);
+
+      __syncthreads();
+
+      //
+      // Load fragments from shared memory
+      //
+
+      typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK];
+
+      shared_load_iterator_.load(aligned_accum_fragment[0]);
+
+      // If the number of k-slices is > 1 - perform a reduction amongst the k-slices
+      if (kPartitionsK > 1) {
+
+        plus <typename SharedLoadIterator::Fragment> add_fragments;
+
+        CUTLASS_PRAGMA_UNROLL
+        for ( int i = 1; i < kPartitionsK; ++i) {
+          shared_load_iterator_.add_pointer_offset(kSmemPointerOffset);
+          shared_load_iterator_.load(aligned_accum_fragment[i]);
+          aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]);
+        }
+
+        shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset);
+      }
+
+      //
+      // Iterate over output fragments
+      //
+
+      AccumulatorAccessType const *accum_frag_ptr =
+        reinterpret_cast<AccumulatorAccessType const *>(&aligned_accum_fragment[0]);
+
+      int const kAccumulatorFragmentCount = AccumulatorTile::kElements / (Visitor::kIterations * AccumulatorAccessType::kElements);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int idx = 0; idx < kAccumulatorFragmentCount; ++idx) {
+
+        int row_idx = idx / SharedLoadIterator::ThreadMap::Iterations::kColumn;
+        int col_idx = idx % SharedLoadIterator::ThreadMap::Iterations::kColumn;
+
+        // Start a new row of the output fragment
+        if (!col_idx) {
+          visitor.begin_row(row_idx);
+        }
+
+        visitor.visit(
+          row_idx,
+          col_idx,
+          idx,
+          accum_frag_ptr[idx]
+        );
+
+        // End the row of the output fragment
+        if (col_idx + 1 == SharedLoadIterator::ThreadMap::Iterations::kColumn) {
+          visitor.end_row(row_idx);
+        }
+      }
+
+      //
+      // Conclude the step
+      //
+
+      visitor.end_step(iter_idx);
+    }
+
+    visitor.end_epilogue();
+  }
+
+private:
+
+
+  template<class Seq>
+  struct acc2smem_source_needed;
+
+  template <size_t... Seq>
+  struct acc2smem_source_needed<cutlass::index_sequence<Seq...>> {
+    template<int Advance>
+    CUTLASS_DEVICE
+    static void helper(AccumulatorFragmentIterator accum_fragment_iterator,
+                       WarpTileIterator &warp_tile_iterator) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < Advance; i++) {
+        ++accum_fragment_iterator;
+      }
+
+      typename AccumulatorFragmentIterator::Fragment accum_fragment;
+      accum_fragment_iterator.load(accum_fragment);
+      warp_tile_iterator.store(accum_fragment);
+    }
+
+    CUTLASS_DEVICE
+    static void push(size_t pos,
+                     AccumulatorFragmentIterator const &iterator_begin,
+                     WarpTileIterator &warp_tile_iterator) {
+      int dummy[] = {(pos == Seq) && (helper<Seq>(iterator_begin, warp_tile_iterator), 0)...};
+    }
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Helper to create an EpilogueWithVisitor from an existing epilogue
+template <typename Visitor_, typename Existing_, bool IterationsUnroll = true>
+struct EpilogueWithVisitorFromExistingEpilogue  {
+
+  using Epilogue = EpilogueWithVisitor<
+    Visitor_,
+    typename Existing_::Shape,
+    typename Existing_::WarpMmaOperator,
+    Existing_::kPartitionsK,
+    typename Existing_::AccumulatorFragmentIterator,
+    typename Existing_::WarpTileIterator,
+    typename Existing_::SharedLoadIterator,
+    typename Existing_::Padding,
+    Existing_::kFragmentsPerIteration,
+    IterationsUnroll
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/35_gemm_softmax/gemm_softmax.cu b/examples/35_gemm_softmax/gemm_softmax.cu
new file mode 100644
index 00000000..7c32232d
--- /dev/null
+++ b/examples/35_gemm_softmax/gemm_softmax.cu
@@ -0,0 +1,681 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+*/
+
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/device/gemm_complex.h"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+
+#include "cutlass/util/reference/host/gemm_complex.h"
+#include "cutlass/util/reference/host/tensor_reduce.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/tensor_view_io.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "gemm_with_softmax.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#define TRACE(x) { std::cout << "gemm_softmax.cu:" << __LINE__ << "  " << x << std::endl; }
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+enum class Disposition {
+  kPassed,
+  kIncorrect,
+  kNotVerified
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  int iterations;
+  unsigned seed;
+  float alpha;
+  float beta;
+  bool verification_enabled;
+  double tolerance;
+
+  Options():
+    help(false),
+    problem_size({16, 24, 64}),
+    batch_count(1),             // As a temporary limitation to the test bench, batch count must be 1. The kernels support arbitrary batching.
+    iterations(20),
+    seed(2022),
+    alpha(1),
+    beta(),
+    verification_enabled(true),
+    tolerance(0.01)
+  { }
+
+  bool valid() {
+
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+
+    cmd.get_cmd_line_argument("alpha", alpha);
+    cmd.get_cmd_line_argument("beta", beta);
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+    cmd.get_cmd_line_argument("verify", verification_enabled);
+    cmd.get_cmd_line_argument("seed", seed);
+    cmd.get_cmd_line_argument("tolerance", tolerance);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "35_gemm_softmax example\n\n"
+      << "  This example uses the CUTLASS Library to compute GEMM + Softmax for arbitrary problem sizes.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha\n"
+      << "  --beta=<f32>                Epilogue scalar beta\n\n"
+      << "  --seed=<int>                Random number seed (1*)\n\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform (0 to disable profiling).\n\n"
+      << "  --verify=<bool>             If true, performs reference calculation.\n\n"
+      << "  --tolerance <float>         Error tolerance\n"
+    ;
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/35_gemm_softmax/35_gemm_softmax --m=1024 --n=512 \\\n"
+      << "     --alpha=2 --beta=0.707 \n\n";
+
+    return out;
+  }
+
+  /// Returns true if the environment and Toolkit support this
+  bool supported(bool verbose = true) const {
+
+    // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available
+    // in CUDA 11.0.
+    //
+    // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples.
+    if (!(__CUDACC_VER_MAJOR__ >= 11)) {
+      if (verbose) {
+        std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+      }
+      return false;
+    }
+
+    cudaDeviceProp props;
+
+    cudaError_t error = cudaGetDeviceProperties(&props, 0);
+    if (error != cudaSuccess) {
+      if (verbose) {
+        std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+      }
+      return false;
+    }
+
+    if (!((props.major * 10 + props.minor) >= 80)) {
+      if (verbose) {
+        std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80."
+                  << std::endl;
+      }
+      return false;
+    }
+
+    return true;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Testbed {
+
+  //
+  // Type definitions
+  //
+
+
+  using ElementA = cutlass::half_t;
+  using ElementB = cutlass::half_t;
+  using ElementC = cutlass::half_t;
+  using ElementD = cutlass::half_t;
+  using ElementCompute = float;
+  using ElementSoftmax = cutlass::half_t;
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using GemmSoftmax = cutlass::GemmSoftmax<
+    ElementA, LayoutA,
+    ElementB, LayoutB,
+    ElementC,
+    ElementCompute
+  >;
+
+  using ElementN = typename GemmSoftmax::ElementN;
+  using LayoutC = typename GemmSoftmax::LayoutC;
+
+  //
+  // Data members
+  //
+
+  Options const &options;
+
+  cutlass::HostTensor<ElementA, LayoutA>        tensor_A;
+  cutlass::HostTensor<ElementB, LayoutB>        tensor_B;
+  cutlass::HostTensor<ElementC, LayoutC>        tensor_C;
+  cutlass::HostTensor<ElementD, LayoutC>        tensor_D;
+  cutlass::HostTensor<ElementN, LayoutC>        tensor_N;
+  cutlass::HostTensor<ElementSoftmax, LayoutC>  tensor_Softmax;
+
+  cutlass::HostTensor<ElementD, LayoutC>        reference_D;
+  cutlass::HostTensor<ElementN, LayoutC>        reference_N;
+  cutlass::HostTensor<ElementSoftmax, LayoutC>  reference_Softmax;
+
+  //
+  // Methods
+  //
+
+  Testbed(
+    Options const &options_
+  ):
+    options(options_)
+  {
+
+    tensor_A.reset({options.problem_size.m(), options.problem_size.k()});
+    tensor_B.reset({options.problem_size.k(), options.problem_size.n()});
+
+    tensor_C.reset({options.problem_size.m(), options.problem_size.n()});
+    tensor_D.reset({options.problem_size.m(), options.problem_size.n()});
+
+    tensor_N.reset({options.problem_size.m(), 1});
+    tensor_Softmax.reset({options.problem_size.m(), options.problem_size.n()});
+
+    reference_D.reset({options.problem_size.m(), options.problem_size.n()}, false);
+    reference_N.reset({options.problem_size.m(), 1}, false);
+    reference_Softmax.reset({options.problem_size.m(), options.problem_size.n()}, false);
+  }
+
+  /// Run
+  Disposition run() {
+
+    Disposition disposition = Disposition::kNotVerified;
+
+    //
+    // Initialize the workspace
+    //
+
+    initialize();
+
+    //
+    // Launch device kernel
+    //
+    cutlass::Status status = cutlass::Status::kSuccess;
+
+    status = execute_device_kernel();
+
+    if (status != cutlass::Status::kSuccess) {
+      std::cerr << "Device execution failed." << std::endl;
+      return disposition;
+    }
+
+    cudaError_t result = cudaDeviceSynchronize();
+    if (result != cudaSuccess) {
+      std::cerr << "Device synchronize failed with error "
+        << cudaGetErrorString(result) << std::endl;
+      return disposition;
+    }
+
+    //
+    // Compute the reference
+    //
+    compute_reference();
+
+    //
+    // Verify
+    //
+
+    if (options.verification_enabled) {
+
+      bool passed = verify();
+
+      if (passed) {
+        disposition = Disposition::kPassed;
+      }
+      else {
+        disposition = Disposition::kIncorrect;
+      }
+    }
+
+    //
+    // Profiling
+    //
+    if (options.iterations) {
+      profile();
+    }
+
+    return disposition;
+  }
+
+  /// Random initialization
+  void initialize() {
+
+    cutlass::reference::host::TensorFillRandomUniform(
+      tensor_A.host_view(),
+        options.seed,
+        ElementD(5),
+        ElementD(-5),
+        0
+      );
+
+    cutlass::reference::host::TensorFillRandomUniform(
+      tensor_B.host_view(),
+        options.seed + 19,
+        ElementD(5),
+        ElementD(-5),
+        0
+      );
+
+    cutlass::reference::host::TensorFill(
+      reference_D.host_view(),
+      ElementD()
+      );
+
+    cutlass::reference::host::TensorFill(
+      reference_N.host_view(),
+      ElementN()
+    );
+
+    cutlass::reference::host::TensorFill(
+      reference_Softmax.host_view(),
+      ElementSoftmax()
+    );
+
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_D.sync_device();
+    tensor_N.sync_device();
+    tensor_Softmax.sync_device();
+  }
+
+  cutlass::Status execute_device_kernel() {
+
+    cutlass::Status status = cutlass::Status::kSuccess;
+
+    //
+    // Setup arguments
+    //
+
+    GemmSoftmax::Arguments args(
+      options.problem_size,
+      options.batch_count,
+      tensor_A.device_ref(),
+      tensor_B.device_ref(),
+      tensor_C.device_ref(),
+      tensor_D.device_ref(),
+      {
+        ElementCompute(options.alpha),
+        ElementCompute(options.beta)
+      },
+      tensor_N.device_ref(),
+      tensor_Softmax.device_ref()
+    );
+
+    //
+    // Launch
+    //
+
+    GemmSoftmax gemm_softmax;
+
+    // Initialize
+    status = gemm_softmax.initialize(args);
+    if (status != cutlass::Status::kSuccess) {
+      return status;
+    }
+
+    // Run
+    status = gemm_softmax();
+
+    return status;
+  }
+
+  /// Reference calculation
+  void compute_reference() {
+
+    // Compute GEMM
+
+    cutlass::reference::host::GemmComplex(
+      options.problem_size,
+      options.alpha,
+      tensor_A.host_ref(),
+      cutlass::ComplexTransform::kNone,
+      tensor_B.host_ref(),
+      cutlass::ComplexTransform::kNone,
+      options.beta,
+      tensor_C.host_ref(),
+      reference_D.host_ref(),
+      double()
+    );
+
+    // Compute the norm
+    for (int m = 0; m < options.problem_size.m(); ++m) {
+      reference_N.at({m, 0}) = reference_D.at({m, 0});
+      for (int n = 1; n < options.problem_size.n(); ++n) {
+        reference_N.at({m, 0}) = std::max(reference_N.at({m, 0}), ElementN(reference_D.at({m, n})));
+      }
+    }
+
+    // Compute softmax
+    for (int m = 0; m < options.problem_size.m(); ++m) {
+
+      float sum = float();
+
+      for (int n = 0; n < options.problem_size.n(); ++n) {
+        sum += std::exp( float(reference_D.at({m, n})) - float(reference_N.at({m, 0})) );
+      }
+
+      float inv_sum = float(1.0f / sum);
+
+      for (int n = 0; n < options.problem_size.n(); ++n) {
+
+        reference_Softmax.at({m, n}) = ElementSoftmax(
+          std::exp( float(reference_D.at({m, n})) - float(reference_N.at({m, 0})) ) * inv_sum
+        );
+      }
+    }
+  }
+
+  /// Emits all tensor values
+  void emit_results() {
+    std::cout << "D = \n" << tensor_D.host_view() << "\n\n";
+    std::cout << "N = \n" << tensor_N.host_view() << "\n\n";
+    std::cout << "Softmax = \n" << tensor_Softmax.host_view() << "\n\n";
+    std::cout << "Reference N = \n" << reference_N.host_view() << "\n\n";
+    std::cout << "Reference D = \n" << reference_D.host_view() << "\n\n";
+    std::cout << "Reference Softmax = \n" << reference_Softmax.host_view() << "\n\n";
+  }
+
+  /// Verifies the reference matches
+  bool verify() {
+
+    tensor_D.sync_host();
+    tensor_N.sync_host();
+    tensor_Softmax.sync_host();
+
+    double const kThreshold = options.tolerance;
+
+    // Verification checks - set any of these to 'true' to override the verification checks.
+    bool verified_D = false;
+    bool verified_N = false;
+    bool verified_Softmax = false;
+
+    // Verify softmax output
+    if (!verified_D) {
+
+      double norm_diff = cutlass::reference::host::TensorNormDiff(
+        tensor_D.host_view(),
+        reference_D.host_view());
+
+      double norm_reference = cutlass::reference::host::TensorNorm(
+        reference_D.host_view());
+
+      double rel_error = norm_diff / norm_reference;
+
+      if (rel_error > kThreshold) {
+        std::cerr << "\n\nTensor D Relative error: " << rel_error << std::endl;
+      }
+      else {
+        verified_D = true;
+      }
+    }
+
+    if (!verified_N) {
+
+      double norm_diff = cutlass::reference::host::TensorNormDiff(
+        tensor_N.host_view(),
+        reference_N.host_view());
+
+      double norm_reference = cutlass::reference::host::TensorNorm(
+        reference_N.host_view());
+
+      double rel_error = norm_diff / norm_reference;
+
+      if (rel_error > kThreshold) {
+        std::cerr << "\n\nTensor N Relative error: " << rel_error << std::endl;
+      }
+      else {
+        verified_N = true;
+      }
+    }
+
+    if (!verified_Softmax) {
+
+      double norm_diff = cutlass::reference::host::TensorNormDiff(
+        tensor_Softmax.host_view(),
+        reference_Softmax.host_view());
+
+      double norm_reference = cutlass::reference::host::TensorNorm(
+        reference_Softmax.host_view());
+
+      double rel_error = norm_diff / norm_reference;
+
+      if (rel_error > kThreshold) {
+        std::cerr << "\n\nSoftmax Relative error: " << rel_error << std::endl;
+      }
+      else {
+        verified_Softmax = true;
+      }
+    }
+
+    if (!verified_D || !verified_N || !verified_Softmax) {
+
+      std::cerr << "Verification check failed for tensor Softmax" << std::endl;
+
+      emit_results();
+
+      // Summarize which checks failed
+      if (!verified_D) {
+        std::cerr << "Verification of D tensor failed\n";
+      }
+
+      if (!verified_N) {
+        std::cerr << "Verification of N tensor failed\n";
+      }
+
+      if (!verified_Softmax) {
+        std::cerr << "Verification of Softmax tensor failed\n";
+      }
+
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Profiles
+  bool profile() {
+
+    //
+    // Profile
+    //
+
+    cutlass::Status status = cutlass::Status::kSuccess;
+    cudaError_t result;
+    cudaEvent_t events[2];
+    int const kIterations = options.iterations;
+
+    for (cudaEvent_t &evt : events) {
+      result = cudaEventCreate(&evt);
+      if (result != cudaSuccess) {
+        std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
+        return false;
+      }
+    }
+
+    result = cudaEventRecord(events[0]);
+
+    if (result != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    for (int iter = 0; iter < kIterations; ++iter) {
+
+      status = execute_device_kernel();
+
+      if (status != cutlass::Status::kSuccess) {
+        std::cerr << "Device execution failed." << std::endl;
+        return false;
+      }
+    }
+
+    result = cudaEventRecord(events[1]);
+
+    if (result != cudaSuccess) {
+      std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    result = cudaDeviceSynchronize();
+
+    if (result != cudaSuccess) {
+      std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    float elapsed_ms = 0;
+    result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
+
+    if (result != cudaSuccess) {
+      std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
+      return false;
+    }
+
+    for (cudaEvent_t &evt : events) {
+      result = cudaEventDestroy(evt);
+      if (result != cudaSuccess) {
+        std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(result) << std::endl;
+        return false;
+      }
+    }
+
+    int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
+    int64_t bytes = (sizeof(ElementD) * 2 + sizeof(ElementSoftmax)) * options.problem_size.m() * options.problem_size.n();
+
+    double gflops_per_second = double(flops) * kIterations / double(elapsed_ms / 1000.0f) / double(1.0e9);
+    double gbytes_per_second = double(bytes) * kIterations / double(elapsed_ms / 1000.0f) / double(1 << 30);
+
+    std::cout << "         Problem: "
+              << options.problem_size.m() << "-by-" << options.problem_size.n() << "-by-" << options.problem_size.k()
+              << std::endl;
+
+    std::cout << "         Runtime: " << elapsed_ms << " ms\n" << std::endl;
+
+    std::cout << "          GFLOPs: " << gflops_per_second << "  GFLOPs" << std::endl;
+    std::cout << "Memory bandwidth: " << gbytes_per_second << "  GiB/s" << std::endl;
+
+    return true;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, const char **argv) {
+
+  // Options parsing
+  Options options;
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (!options.supported()) {
+    return 0;
+  }
+
+  // Run
+  Testbed testbed(options);
+
+  Disposition disposition = testbed.run();
+
+  std::cout << std::endl;
+
+  switch (disposition) {
+    case Disposition::kPassed:
+      std::cout << "Passed" << std::endl;
+      break;
+    case Disposition::kIncorrect:
+      std::cout << "Incorrect" << std::endl;
+      break;
+    case Disposition::kNotVerified:
+      std::cout << "Not verified" << std::endl;
+      break;
+  }
+
+  return (disposition == Disposition::kPassed ? 0 : -1);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
new file mode 100644
index 00000000..814de5ae
--- /dev/null
+++ b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
@@ -0,0 +1,498 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief GEMM kernel to support the 'epilogue visitor' model for fusion.
+
+    This source file will likely be moved to `include/cutlass/gemm/kernel/` in the future once
+    its usage has been stabilized. For now, it is included in this example to demonstrate
+    some basic output fusion options.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_    ///! Threadblock swizzling function
+>
+struct GemmWithEpilogueVisitor {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueVisitor = typename Epilogue::Visitor;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using TensorRefB = TensorRef<ElementB, LayoutB>;
+
+  using ElementC = typename EpilogueVisitor::ElementOutput;
+  using LayoutC = typename Epilogue::Layout;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(
+    128 / sizeof_bits<ElementA>::value,
+    128 / sizeof_bits<ElementB>::value
+  );
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+
+    int64_t    batch_stride_A;
+    int64_t    batch_stride_B;
+
+    typename EpilogueVisitor::Arguments epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    Arguments():
+      mode(GemmUniversalMode::kGemm),
+      batch_count(1)
+    { }
+
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode_,
+      GemmCoord problem_size_,
+      int batch_count_,
+      TensorRefA ref_A_,
+      TensorRefB ref_B_,
+      int64_t batch_stride_A_,
+      int64_t batch_stride_B_,
+      typename EpilogueVisitor::Arguments epilogue_visitor_
+    ):
+      mode(mode_),
+      problem_size(problem_size_),
+      batch_count(batch_count_),
+      ref_A(ref_A_),
+      ref_B(ref_B_),
+      batch_stride_A(batch_stride_A_),
+      batch_stride_B(batch_stride_B_),
+      epilogue_visitor(epilogue_visitor_)
+    {
+
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void * ptr_A;
+    void * ptr_B;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+
+    typename EpilogueVisitor::Params epilogue_visitor;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      params_A(0),
+      params_B(0),
+      batch_count(0),
+      gemm_k_size(0),
+      mode(cutlass::gemm::GemmUniversalMode::kGemm),
+      ptr_A(nullptr),
+      ptr_B(nullptr),
+      batch_stride_A(0),
+      batch_stride_B(0)
+    { }
+
+
+    Params(
+      Arguments const &args
+    ):
+      problem_size(args.problem_size),
+      swizzle_log_tile(0),
+      params_A(args.ref_A.layout()),
+      params_B(args.ref_B.layout()),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(args.problem_size.k()),
+      ptr_A(args.ref_A.data()),
+      ptr_B(args.ref_B.data()),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      epilogue_visitor(args.epilogue_visitor)
+    {
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        args.problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.batch_count);
+
+      if (args.mode == GemmUniversalMode::kGemm || args.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+        int const kAlignK = const_max(const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value), 1);
+
+        gemm_k_size = round_up(ceil_div(args.problem_size.k(), args.batch_count), kAlignK);
+
+        if (gemm_k_size) {
+          grid_tiled_shape.k() = ceil_div(args.problem_size.k(), gemm_k_size);
+        }
+      }
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+
+    typename Mma::SharedStorage main_loop;
+
+    struct {
+      typename Epilogue::SharedStorage epilogue;
+      typename EpilogueVisitor::SharedStorage visitor;
+    } epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  GemmWithEpilogueVisitor() { }
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    CUTLASS_TRACE_HOST("GemmWithEpilogueVisitor::can_implement()");
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    CUTLASS_TRACE_HOST("  returning kSuccess");
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  static size_t get_extra_workspace_size(Arguments const &args,
+                                         cutlass::gemm::GemmCoord const &grid_tiled_shape) {
+
+    return 0;
+  }
+
+  #define SPLIT_K_ENABLED 1
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+
+    #if SPLIT_K_ENABLED
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm ||
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+    #endif
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations,
+      accumulators,
+      iterator_A,
+      iterator_B,
+      accumulators);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    //
+    // Construct the epilogue visitor
+    //
+
+    EpilogueVisitor epilogue_visitor(
+      params.epilogue_visitor,
+      shared_storage.epilogue.visitor,
+      params.problem_size.mn(),
+      thread_idx,
+      warp_idx,
+      lane_idx,
+      threadblock_offset);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+      // Indicate which position in a serial reduction the output operator is currently updating
+      epilogue_visitor.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+    }
+    else if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
+      epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
+    }
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(epilogue_visitor, accumulators);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/35_gemm_softmax/gemm_with_softmax.h b/examples/35_gemm_softmax/gemm_with_softmax.h
new file mode 100644
index 00000000..638017cc
--- /dev/null
+++ b/examples/35_gemm_softmax/gemm_with_softmax.h
@@ -0,0 +1,1154 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+*/
+
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <cmath>
+#include <iostream>
+#include <vector>
+#include <limits>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/arch/memory_sm75.h"
+
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass/gemm/kernel/default_gemm_complex.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "epilogue_with_visitor.h"
+#include "gemm_with_epilogue_visitor.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Kernel computes partial reduction
+//
+//
+// 2. Sum[m, n'] = sum_n(exp(D[m, n] - N[m, 0]))
+//
+template <
+  typename ElementD_,
+  typename ElementN_,
+  typename ElementSum_,
+  typename ElementSoft_,
+  int Alignment,
+  typename Shape_ = MatrixShape<4, 16>
+>
+class ApplySoftmax {
+public:
+
+  using ElementD = ElementD_;
+  using ElementN = ElementN_;
+  using ElementSum = ElementSum_;
+  using ElementSoft = ElementSoft_;
+
+  static int const kAlignment = Alignment;
+  using Shape = Shape_;
+
+  using Layout = cutlass::layout::RowMajor;
+
+  using TensorRefD = TensorRef<ElementD, Layout>;
+  using TensorRefN = TensorRef<ElementN, Layout>;
+  using TensorRefSum = TensorRef<ElementSum, Layout>;
+  using TensorRefSoft = TensorRef<ElementSoft, Layout>;
+
+  using FragmentSum = Array<ElementSum, kAlignment>;
+
+  //
+  // Arguments
+  //
+
+  struct Arguments {
+
+    MatrixCoord     extent;             ///< Extent of D and Softmax matrices
+    int             batch_count;        ///< Batch count
+    TensorRefD      ref_D;              ///< D matrix computed by GEMM+Max (input)
+    TensorRefN      ref_N;              ///< Norm tensor (input)
+    TensorRefSoft   ref_Soft;           ///< Softmax tensor (output)
+    int64_t         batch_stride_D;     ///< Batch stride for D tensor
+    int64_t         batch_stride_N;     ///< Batch stride for N tensor
+    int64_t         batch_stride_Soft;  ///< Batch stride for softmax tensor
+
+    //
+    // Methods
+    //
+    Arguments():
+      batch_count(1),
+      batch_stride_D(0),
+      batch_stride_N(0),
+      batch_stride_Soft(0)
+    { }
+
+    Arguments(
+      MatrixCoord     extent_,             ///< Extent of D and Softmax matrices
+      int             batch_count_,        ///< Batch count
+      TensorRefD      ref_D_,              ///< D matrix computed by GEMM+PartialReduce
+      TensorRefN      ref_N_,              ///< Output parameter for N
+      TensorRefSoft   ref_Soft_,           ///< Softmax
+      int64_t         batch_stride_D_ = 0,
+      int64_t         batch_stride_N_ = 0,
+      int64_t         batch_stride_Soft_ = 0
+    ):
+      extent(extent_),
+      batch_count(batch_count_),
+      ref_D(ref_D_),
+      ref_N(ref_N_),
+      ref_Soft(ref_Soft_),
+      batch_stride_D(batch_stride_D_),
+      batch_stride_N(batch_stride_N_),
+      batch_stride_Soft(batch_stride_Soft_)
+    {
+
+    }
+  };
+
+  //
+  // Params struct
+  //
+
+  struct Params {
+    Arguments args;
+
+    //
+    // Methods
+    //
+    Params() { }
+
+    Params(Arguments const &args_): args(args_) { }
+  };
+
+  //
+  // SharedStorage
+  //
+
+  struct SharedStorage {
+
+    AlignedArray<ElementSum, Shape::kCount> exchange;
+    AlignedArray<ElementSum, Shape::kRow> inv_sum;
+    AlignedArray<ElementSum, Shape::kRow> norm;
+
+  };
+
+private:
+
+public:
+
+  CUTLASS_DEVICE
+  ApplySoftmax() { }
+
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Phase 1. Reduction over contiguous dimension
+    reduce_partial(params, shared_storage);
+
+    __syncthreads();
+
+    // Phase 2. Final reduction within SMEM - yields sum_n(exp(D - N))
+    reduce_final(params, shared_storage);
+
+    __syncthreads();
+
+    // Phase 3. Apply
+    apply(params, shared_storage);
+  }
+
+private:
+
+  /// Partial reduction
+  CUTLASS_DEVICE
+  void reduce_partial(Params const &params, SharedStorage &shared_storage) {
+
+    //
+    // Sum over the matrix
+    //
+    using AccessTypeD = AlignedArray<ElementD, kAlignment>;
+
+    int block_batch = blockIdx.z;
+    int block_m = blockIdx.x * Shape::kRow;
+    int block_n = 0;
+
+    int thread_m = threadIdx.y;
+    int thread_n = threadIdx.x * kAlignment;
+
+    int idx_m = block_m + thread_m;
+    int idx_n = block_n + thread_n;
+
+    AccessTypeD *access_d = reinterpret_cast<AccessTypeD *>(
+      params.args.ref_D.data() +
+      params.args.batch_stride_D * block_batch +
+      params.args.ref_D.layout()({idx_m, idx_n}));
+
+    using ConvertS = cutlass::NumericArrayConverter<ElementSum, ElementD, kAlignment>;
+
+    using Plus = cutlass::plus<FragmentSum>;
+    using Minus = cutlass::minus<FragmentSum>;
+    using Exp   = cutlass::fast_exp_op<FragmentSum>;
+
+    ConvertS  convert_s;
+    Minus     minus;
+    Plus      plus;
+    Exp       exponential;
+
+    FragmentSum frag_Sum;
+    frag_Sum.clear();
+
+    if (idx_m < params.args.extent.row()) {
+
+      // Fetch the norm from GlobalMemory
+      ElementN norm = params.args.ref_N.data()[params.args.batch_stride_N * block_batch + idx_m];
+      ElementSum norm_cvt = ElementSum(norm);
+
+      FragmentSum norm_vec;
+
+      norm_vec.fill(norm_cvt);
+      shared_storage.norm[thread_m] = ElementSum(norm_cvt);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (
+        int idx = 0;
+        idx < params.args.extent.column();
+        idx += Shape::kColumn * kAlignment) {
+
+        if (idx_n < params.args.extent.column()) {
+
+          AccessTypeD fetch;
+          arch::global_load<AccessTypeD, sizeof(AccessTypeD)>(fetch, access_d, true);
+
+          auto tmp = exponential(minus(convert_s(fetch), norm_vec));
+
+          frag_Sum = plus(frag_Sum, tmp);
+        }
+
+        access_d += Shape::kColumn;
+        idx_n += Shape::kColumn * kAlignment;
+      }
+
+      // Sum the elements owned by one thread
+      ElementSum sum = frag_Sum[0];
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < kAlignment; ++i) {
+        sum += frag_Sum[i];
+      }
+
+      shared_storage.exchange.data()[threadIdx.x + threadIdx.y * Shape::kColumn] = sum;
+    }
+  }
+
+  /// Compute the final summation from data in SMEM
+  CUTLASS_DEVICE
+  void reduce_final(Params const &params, SharedStorage &shared_storage) {
+
+    //
+    // SMEM has shape `Shape::Row`-by-`Shape::Column`
+    //
+    // This computes a reduction across the `Column` dimension yielding a `Row-by-1` vector.
+    //
+
+    #if true
+    //
+    // Tuning parameters tradeoff ILP with latency
+    //
+    // During each step of the reduction, each thread performs `kAccesses` of vector size `kReduceVector`
+
+    // Tune the number of accesses per reduction
+    int const kAccesses = 2;
+
+    // Tune the memory access size
+    int const kReduceVector = 4;
+
+    //
+    // Static asserts to ensure integrity
+    //
+
+    static_assert(kAccesses * kReduceVector,
+      "Zero-size steps would infinitely loop.");
+
+    static_assert(
+      is_pow2<Shape::kColumn>::value &&
+      is_pow2<kAccesses>::value &&
+      is_pow2<kReduceVector>::value,
+      "Powers of two only.");
+
+    static_assert(!(Shape::kColumn % (kAccesses * kReduceVector)),
+      "Divisibility not satisfied");
+
+    //
+    // Reduction operators
+    //
+
+    using FragmentSum = Array<ElementSum, kReduceVector>;
+    using Plus = cutlass::plus<FragmentSum>;
+
+    Plus plus;
+
+    // Tree reduction
+    ElementSum *smem_ptr = shared_storage.exchange.data() + threadIdx.y * Shape::kColumn;
+
+    ElementSum final = ElementSum();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (
+      int tidx_limit = Shape::kColumn / (kAccesses * kReduceVector);
+      tidx_limit > 0;
+      tidx_limit /= (kAccesses * kReduceVector)) {
+
+      if (threadIdx.x < tidx_limit) {
+        FragmentSum fetch;
+
+        arch::shared_load<sizeof(FragmentSum)>(
+          &fetch,
+          arch::cutlass_get_smem_pointer(smem_ptr + threadIdx.x * kReduceVector));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < kAccesses; ++i) {
+          FragmentSum extra;
+
+          arch::shared_load<sizeof(FragmentSum)>(
+            &extra,
+            arch::cutlass_get_smem_pointer(
+              smem_ptr + threadIdx.x * kReduceVector + tidx_limit * kReduceVector * i));
+
+          fetch = plus(fetch, extra);
+        }
+
+        // Reduce to one element
+        final = fetch[0];
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 1; i < kReduceVector; ++i) {
+          final += fetch[i];
+        }
+      }
+      __syncthreads();
+
+      if (threadIdx.x < tidx_limit) {
+        smem_ptr[threadIdx.x] = final;
+      }
+      __syncthreads();
+    }
+
+    if (threadIdx.x == 0) {
+
+      int const kLgResidual =
+        (log2_down<Shape::kColumn>::value % log2_down<kAccesses * kReduceVector>::value);
+
+      // Certain shape combinations require an additional reduction step
+      if (kLgResidual) {
+        final = ElementSum();
+
+        int const kResidualVector = (1 << kLgResidual);
+        Array<ElementSum, kResidualVector> fetch;
+
+        arch::shared_load<sizeof(FragmentSum)>(
+          &fetch,
+          arch::cutlass_get_smem_pointer(smem_ptr));
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kResidualVector; ++i) {
+          final += fetch[i];
+        }
+      }
+
+      // compute inverse
+      ElementSum inv_sum = cutlass::constants::one<ElementSum>() / final;
+
+      // Store to shared memory
+      shared_storage.inv_sum[threadIdx.y] = inv_sum;
+    }
+
+    #else
+
+    // Primitive serial reduction
+    if (threadIdx.x < Shape::kRow && threadIdx.y == 0) {
+      ElementSum *smem_ptr = shared_storage.exchange.data() + threadIdx.x * Shape::kColumn;
+
+      ElementSum sum = smem_ptr[0];
+      CUTLASS_PRAGMA_UNROLL
+      for (int n = 1; n < Shape::kColumn; ++n) {
+        sum += smem_ptr[n];
+      }
+
+      // compute inverse
+      ElementSum inv_sum = cutlass::constants::one<ElementSum>() / sum;
+
+      // Store to shared memory
+      shared_storage.inv_sum[threadIdx.x] = inv_sum;
+    }
+    #endif
+  }
+
+  /// Compute Softmax
+  CUTLASS_DEVICE
+  void apply(Params const &params, SharedStorage &shared_storage) {
+
+    using AccessTypeD = AlignedArray<ElementD, kAlignment>;
+
+    int block_batch = blockIdx.z;
+    int block_m = blockIdx.x * Shape::kRow;
+    int block_n = 0;
+
+    int thread_m = threadIdx.y;
+    int thread_n = threadIdx.x * kAlignment;
+
+    int idx_m = block_m + thread_m;
+    int idx_n = block_n + thread_n;
+
+    // Kill off thread if it is outside the row boundary
+    if (params.args.extent.row() <= idx_m) {
+      return;
+    }
+
+    //
+    // Setup pointers to load D again
+    //
+
+    using AccessTypeD = AlignedArray<ElementD, kAlignment>;
+    using AccessTypeSoft = AlignedArray<ElementSoft, kAlignment>;
+    using FragmentSoft = Array<ElementSoft, kAlignment>;
+    using ConvertSum = cutlass::NumericArrayConverter<ElementSum, ElementD, kAlignment>;
+    using ConvertSoft = cutlass::NumericArrayConverter<ElementSoft, ElementSum, kAlignment>;
+
+    using Mul = cutlass::multiplies<FragmentSum>;
+    using Minus = cutlass::minus<FragmentSum>;
+    using Exp   = cutlass::fast_exp_op<FragmentSum>;
+
+    ConvertSum   convert_sum;
+    ConvertSoft  convert_soft;
+
+    Minus     minus;
+    Mul       mul;
+    Exp       exponential;
+
+    AccessTypeD *access_d = reinterpret_cast<AccessTypeD *>(
+      params.args.ref_D.data() +
+      params.args.batch_stride_D * block_batch +
+      params.args.ref_D.layout()({idx_m, idx_n}));
+
+    AccessTypeSoft *access_soft = reinterpret_cast<AccessTypeSoft *>(
+      params.args.ref_Soft.data() +
+      params.args.batch_stride_Soft * block_batch +
+      params.args.ref_Soft.layout()({idx_m, idx_n}));
+
+    // Fetch inv_sum from SharedMemory
+    ElementSum inv_sum = shared_storage.inv_sum[thread_m];
+
+    // Fetch the norm from SharedMemory
+    ElementSum norm = shared_storage.norm[thread_m];
+
+    //
+    // Loop
+    //
+    CUTLASS_PRAGMA_UNROLL
+    for (
+      int idx = 0;
+      idx < params.args.extent.column();
+      idx += Shape::kColumn * kAlignment) {
+
+      if (idx_n < params.args.extent.column()) {
+
+        AccessTypeD fetch;
+        arch::global_load<AccessTypeD, sizeof(AccessTypeD)>(fetch, access_d, true);
+
+        FragmentSum result = mul(exponential(minus(convert_sum(fetch), norm)), inv_sum);
+        FragmentSoft soft  = convert_soft(result);
+
+        arch::global_store<FragmentSoft, sizeof(FragmentSoft)>(soft, access_soft, true);
+      }
+
+      access_d += Shape::kColumn;
+      access_soft += Shape::kColumn;
+      idx_n += Shape::kColumn * kAlignment;
+    }
+  }
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ThreadblockShape_,
+  int ThreadCount,
+  typename OutputTileIterator_,
+  typename ElementAccumulator_,
+  typename ElementwiseFunctor_
+>
+class EpilogueVisitorBiasMax {
+public:
+
+  using ThreadblockShape   = ThreadblockShape_;
+  static int const kThreadCount = ThreadCount;
+
+  using OutputTileIterator = OutputTileIterator_;
+  using ElementwiseFunctor = ElementwiseFunctor_;
+
+  static int const kIterations = OutputTileIterator::kIterations;
+  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
+
+  using ElementOutput = typename OutputTileIterator::Element;
+  using LayoutOutput = cutlass::layout::RowMajor;
+
+  using ElementAccumulator = ElementAccumulator_;
+
+  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
+  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
+  using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;
+
+  /// Argument structure
+  struct Arguments {
+
+    typename ElementwiseFunctor::Params   elementwise;
+    TensorRefD                            ref_C;
+    TensorRefD                            ref_D;
+    float                                *ptr_Max;
+    int64_t                               batch_stride_C;
+    int64_t                               batch_stride_D;
+    int64_t                               batch_stride_Max;
+
+    //
+    // Methods
+    //
+    Arguments():
+      ptr_Max(nullptr),
+      batch_stride_C(0),
+      batch_stride_D(0),
+      batch_stride_Max(0)
+    {
+
+    }
+
+    Arguments(
+      typename ElementwiseFunctor::Params   elementwise_,
+      TensorRefD                            ref_C_,
+      TensorRefD                            ref_D_,
+      float                                *ptr_Max_,
+      int64_t                               batch_stride_C_,
+      int64_t                               batch_stride_D_,
+      int64_t                               batch_stride_Max_
+    ):
+      elementwise(elementwise_),
+      ref_C(ref_C_),
+      ref_D(ref_D_),
+      ptr_Max(ptr_Max_),
+      batch_stride_C(batch_stride_C_),
+      batch_stride_D(batch_stride_D_),
+      batch_stride_Max(batch_stride_Max_)
+    {
+
+    }
+  };
+
+  struct Params {
+
+    typename ElementwiseFunctor::Params   elementwise;
+    typename OutputTileIterator::Params   params_C;
+    typename OutputTileIterator::Params   params_D;
+    typename OutputTileIterator::Element *ptr_C;
+    typename OutputTileIterator::Element *ptr_D;
+    float                                *ptr_Max;
+    int64_t                               batch_stride_C;
+    int64_t                               batch_stride_D;
+    int64_t                               batch_stride_Max;
+
+    //
+    // Methods
+    //
+    CUTLASS_HOST_DEVICE
+    Params():
+      ptr_D(nullptr),
+      ptr_Max(nullptr)
+    {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(Arguments const &args):
+      elementwise(args.elementwise),
+      params_C(args.ref_C.layout()),
+      params_D(args.ref_D.layout()),
+      ptr_C(args.ref_C.data()),
+      ptr_D(args.ref_D.data()),
+      ptr_Max(args.ptr_Max),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      batch_stride_Max(args.batch_stride_Max)
+    {
+
+    }
+  };
+
+  /// Shared storage
+  struct SharedStorage {
+    float reduction[ThreadblockShape::kM];
+  };
+
+private:
+
+  Params const &                        params_;
+  SharedStorage &                       shared_storage_;
+  MatrixCoord                           extent_;
+  ElementwiseFunctor                    elementwise_;
+
+  OutputTileIterator                    iterator_C_;
+  OutputTileIterator                    iterator_D_;
+  typename OutputTileIterator::Fragment fragment_C_;
+  typename OutputTileIterator::Fragment fragment_D_;
+
+  ElementAccumulator                    alpha_;
+  ElementAccumulator                    beta_;
+
+  ElementAccumulator                    accum_max_;
+  int                                   threadblock_row_;
+
+public:
+
+  CUTLASS_DEVICE
+  EpilogueVisitorBiasMax(
+    Params const &params,                                         ///< Parameters routed to the epilogue
+    SharedStorage &shared_storage,                                ///< Shared storage needed by the functors here
+    MatrixCoord const &problem_size,                              ///< Problem size of the output
+    int thread_idx,                                               ///< Thread index within the threadblock
+    int warp_idx,                                                 ///< Warp index within the threadblock
+    int lane_idx,                                                 ///< Lane index within the warp
+    MatrixCoord const &threadblock_offset = MatrixCoord(0, 0)
+  ):
+    params_(params),
+    shared_storage_(shared_storage),
+    extent_(problem_size),
+    elementwise_(params.elementwise),
+    iterator_C_(params.params_C, params.ptr_C, problem_size, thread_idx, threadblock_offset),
+    iterator_D_(params.params_D, params.ptr_D, problem_size, thread_idx, threadblock_offset),
+    threadblock_row_(threadblock_offset.row())
+  {
+    alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
+    beta_ =  (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);
+
+    if (beta_ == ElementAccumulator()) {
+      iterator_C_.clear_mask();
+    }
+  }
+
+  /// Helper to indicate split-K behavior
+  CUTLASS_DEVICE
+  void set_k_partition(
+    int split_k_index,                                            ///< Index of this threadblock within split-K partitioned scheme
+    int split_k_slices) {                                         ///< Total number of split-K slices
+
+  }
+
+  /// Called to set the batch index
+  CUTLASS_DEVICE
+  void set_batch_index(int batch_idx) {
+    iterator_C_.add_pointer_offset(batch_idx * params_.batch_stride_C);
+    iterator_D_.add_pointer_offset(batch_idx * params_.batch_stride_D);
+  }
+
+  /// Called at the start of the epilogue just before iterating over accumulator slices
+  CUTLASS_DEVICE
+  void begin_epilogue() {
+
+    int const kStoreCount = (ThreadblockShape::kM + kThreadCount - 1) / kThreadCount;
+
+    clear_accum_max_();
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kStoreCount; ++i) {
+      shared_storage_.reduction[i * kThreadCount + threadIdx.x] = accum_max_;
+    }
+  }
+
+  /// Called at the start of one step before starting accumulator exchange
+  CUTLASS_DEVICE
+  void begin_step(int step_idx) {
+    fragment_D_.clear();
+    fragment_C_.clear();
+
+    iterator_C_.load(fragment_C_);
+    ++iterator_C_;
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void begin_row(int row_idx) {
+
+  }
+
+  /// Called after accumulators have been exchanged for each accumulator vector
+  CUTLASS_DEVICE
+  void visit(
+    int row_idx,
+    int column_idx,
+    int frag_idx,
+    AccumulatorFragment const &accum) {
+
+    NumericArrayConverter<ElementAccumulator, ElementOutput, kElementsPerAccess> source_converter;
+    OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];
+
+    AccumulatorFragment source = source_converter(source_vector);
+    AccumulatorFragment result = alpha_ * accum + beta_ * source;
+
+    MatrixCoord thread_offset =
+      iterator_D_.thread_start() +
+      OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
+
+    bool column_guard = (thread_offset.column() < extent_.column());
+
+    // Compute the maximum within one row
+    if (!column_idx) {
+
+      // This is the first fragment in a new row
+      if (column_guard) {
+        accum_max_ = maximum_accumulator_(accum);
+      }
+    }
+    else {
+
+      // This is an additional fragment in the same row
+      if (column_guard) {
+        accum_max_ = maximum_accumulator_(accum, accum_max_);
+      }
+    }
+
+    // If this is the last vector in the row, compute the final max and store it out
+    if (column_idx + 1 == OutputTileIterator::ThreadMap::Iterations::kColumn) {
+
+      float float_max_element = float(accum_max_);
+
+      int thread_row = thread_offset.row() - threadblock_row_;
+
+      // Shared memory atomic operation to partially reduce the maximum element
+      atomicMax(
+        reinterpret_cast<int *>(shared_storage_.reduction + thread_row),
+        reinterpret_cast<int const &>(float_max_element)
+      );
+
+      clear_accum_max_();
+    }
+
+    // Convert to the output
+    NumericArrayConverter<ElementOutput, ElementAccumulator, kElementsPerAccess> output_converter;
+    OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
+    output = output_converter(result);
+  }
+
+  /// Called at the start of a row
+  CUTLASS_DEVICE
+  void end_row(int row_idx) {
+
+  }
+
+  /// Called after all accumulator elements have been visited
+  CUTLASS_DEVICE
+  void end_step(int step_idx) {
+
+    iterator_D_.store(fragment_D_);
+    ++iterator_D_;
+  }
+
+  /// Called after all steps have been completed
+  CUTLASS_DEVICE
+  void end_epilogue() {
+
+    __syncthreads();
+
+    int block_batch = blockIdx.z;
+    int tidx_m = threadblock_row_ + threadIdx.x;
+
+    float float_max_element = shared_storage_.reduction[threadIdx.x];
+
+    if (tidx_m < extent_.row()) {
+
+      atomicMax(
+        reinterpret_cast<int *>(
+          params_.ptr_Max +
+          params_.batch_stride_Max * block_batch +
+          tidx_m),
+        reinterpret_cast<int const &>(float_max_element)
+      );
+    }
+  }
+
+private:
+
+  CUTLASS_DEVICE
+  void clear_accum_max_() {
+
+    uint32_t float_max_bits = 0xff7fffff;   // -FLT_MAX
+
+    accum_max_ = reinterpret_cast<float const &>(float_max_bits);
+  }
+
+  CUTLASS_DEVICE
+  float maximum_accumulator_(AccumulatorFragment const &accum) {
+    ElementAccumulator max_ = accum[0];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < AccumulatorFragment::kElements; ++i) {
+      max_ = fast_max(max_, ElementAccumulator(accum[i]));
+    }
+
+    return max_;
+  }
+
+  CUTLASS_DEVICE
+  ElementAccumulator maximum_accumulator_(AccumulatorFragment const &accum, ElementAccumulator max_) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < AccumulatorFragment::kElements; ++i) {
+      max_ = fast_max(max_, ElementAccumulator(accum[i]));
+    }
+
+    return max_;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+///
+template <
+  typename ElementA_,
+  typename LayoutA_,
+  typename ElementB_,
+  typename LayoutB_,
+  typename ElementC_,
+  typename ElementCompute_,
+  int Alignment = 128 / cutlass::sizeof_bits<ElementA_>::value,
+  typename ElementSum_ = ElementCompute_,
+  typename ElementSoftmax_ = ElementC_
+>
+class GemmSoftmax {
+public:
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+
+  //
+  // Type definitions
+  //
+
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using ElementCompute = ElementCompute_;
+  using ElementSum = ElementSum_;
+  using ElementSoft = ElementSoftmax_;
+
+  using LayoutA = LayoutA_;
+  using LayoutB = LayoutB_;
+
+  static int const kAlignment = Alignment;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+
+  /// Linear scaling operator
+  using EpilogueFunctorOp = cutlass::epilogue::thread::LinearCombination<
+    ElementC,
+    128 / cutlass::sizeof_bits<ElementC>::value,
+    ElementCompute,
+    ElementCompute
+  >;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+
+  // This is a mandatory data type for the atomic reduction in the GEMM epilogue to function.
+  using ElementN = float;
+
+  // These are mandatory layouts.
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutN = cutlass::layout::RowMajor;
+  using LayoutSoft = cutlass::layout::RowMajor;
+
+  using TensorRefA = TensorRef<ElementA, LayoutA>;
+  using TensorRefB = TensorRef<ElementB, LayoutB>;
+  using TensorRefC = TensorRef<ElementC, LayoutC>;
+  using TensorRefN = TensorRef<ElementN, LayoutN>;
+  using TensorRefSoft = TensorRef<ElementSoft, LayoutSoft>;
+
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;
+  using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  using OperatorClass       = cutlass::arch::OpClassTensorOp;
+  using ArchTag             = cutlass::arch::Sm80;
+  static int const kStages  = 3;
+
+  using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+
+  // basic GEMM kernel
+  using DefaultGemmKernel = typename cutlass::gemm::kernel::DefaultGemm<
+    ElementA,
+    LayoutA,
+    kAlignment,
+    ElementB,
+    LayoutB,
+    kAlignment,
+    ElementC,
+    LayoutC,
+    ElementCompute,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueFunctorOp,
+    ThreadblockSwizzle,
+    kStages,
+    true,
+    typename cutlass::gemm::device::DefaultGemmConfiguration<
+        OperatorClass, ArchTag, ElementA, ElementB, ElementC, ElementCompute>::Operator,
+    cutlass::gemm::SharedMemoryClearOption::kNone
+  >::GemmKernel;
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+
+  // Epilogue visitor
+  using EpilogueVisitor = kernel::EpilogueVisitorBiasMax<
+    ThreadblockShape,
+    DefaultGemmKernel::kThreadCount,
+    typename DefaultGemmKernel::Epilogue::OutputTileIterator,
+    ElementCompute,
+    EpilogueFunctorOp
+  >;
+
+  /// Epilogue
+  using Epilogue = typename cutlass::epilogue::threadblock::EpilogueWithVisitorFromExistingEpilogue<
+    EpilogueVisitor,
+    typename DefaultGemmKernel::Epilogue
+  >::Epilogue;
+
+  // GEMM
+  using GemmKernel = gemm::kernel::GemmWithEpilogueVisitor<
+    typename DefaultGemmKernel::Mma,
+    Epilogue,
+    ThreadblockSwizzle
+  >;
+
+  // Softmax kernel
+  using SoftmaxApplyKernel = kernel::ApplySoftmax<
+    ElementC,
+    ElementN,
+    ElementSum,
+    ElementSoft,
+    kAlignment,
+    MatrixShape<
+      1, 1024
+    >
+  >;
+
+public:
+
+  /// Arguments class
+  struct Arguments {
+
+    typename GemmKernel::Arguments         gemm;
+    typename SoftmaxApplyKernel::Arguments softmax;
+
+    //
+    // Methods
+    //
+    Arguments() { }
+
+    Arguments(
+      cutlass::gemm::GemmCoord problem_size,
+      int32_t    batch_count_,
+      TensorRefA ref_A_,
+      TensorRefB ref_B_,
+      TensorRefC ref_C_,
+      TensorRefC ref_D_,
+      typename EpilogueFunctorOp::Params linear_scaling,
+      TensorRefN ref_N_,
+      TensorRefSoft ref_Softmax_,
+      int64_t batch_stride_A_ = 0,
+      int64_t batch_stride_B_ = 0,
+      int64_t batch_stride_C_ = 0,
+      int64_t batch_stride_D_ = 0,
+      int64_t batch_stride_Max_ = 0,
+      int64_t batch_stride_Softmax_ = 0
+    ):
+      gemm(
+        cutlass::gemm::GemmUniversalMode::kBatched,
+        problem_size,
+        batch_count_,
+        ref_A_,
+        ref_B_,
+        batch_stride_A_,
+        batch_stride_B_,
+        typename EpilogueVisitor::Arguments(
+          linear_scaling,
+          ref_C_,
+          ref_D_,
+          ref_N_.data(),
+          batch_stride_C_,
+          batch_stride_D_,
+          batch_stride_Max_
+        )
+      ),
+      softmax(
+        MatrixCoord(problem_size.m(), problem_size.n()),
+        batch_count_,
+        ref_D_,
+        ref_N_,
+        ref_Softmax_,
+        batch_stride_D_,
+        batch_stride_Max_,
+        batch_stride_Softmax_
+      )
+    {
+
+    }
+  };
+
+  struct Params {
+
+    typename GemmKernel::Params         gemm;
+    typename SoftmaxApplyKernel::Params softmax;
+
+    //
+    // Methods
+    //
+    Params() { }
+
+    Params(Arguments const &args):
+      gemm(args.gemm),
+      softmax(args.softmax)
+    {
+
+    }
+  };
+
+public:
+
+  // Gemm
+
+
+  //
+  // Methods
+  //
+
+private:
+
+  Params params_;
+
+public:
+
+  /// Ctor
+  GemmSoftmax() {
+
+  }
+
+  /// Initialize
+  Status initialize(Arguments const &args) {
+
+    params_ = Params(args);
+
+    return cutlass::Status::kSuccess;
+  }
+
+  /// Run
+  Status run(cudaStream_t stream) {
+
+    //
+    // Launch the GEMM + max kernel
+    //
+
+    dim3 gemm_grid = ThreadblockSwizzle().get_grid_shape(params_.gemm.grid_tiled_shape);
+    dim3 gemm_block(GemmKernel::kThreadCount, 1, 1);
+
+    int gemm_smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+    cutlass::Kernel<GemmKernel><<<gemm_grid, gemm_block, gemm_smem_size, stream>>>(params_.gemm);
+
+    cudaError_t result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      return cutlass::Status::kErrorInternal;
+    }
+
+    //
+    // Launch the SoftmaxApplyKernel
+    //
+
+    dim3 apply_block(SoftmaxApplyKernel::Shape::kColumn, SoftmaxApplyKernel::Shape::kRow);
+
+    int cta_rows = SoftmaxApplyKernel::Shape::kRow;
+    int cta_columns = SoftmaxApplyKernel::Shape::kColumn * SoftmaxApplyKernel::kAlignment;
+
+    dim3 apply_grid(
+      (params_.softmax.args.extent.row() + cta_rows - 1) / cta_rows,
+      (params_.softmax.args.extent.column() + cta_columns - 1) / cta_columns,
+      params_.softmax.args.batch_count);
+
+    Kernel<SoftmaxApplyKernel><<<
+      apply_grid, apply_block, sizeof(typename SoftmaxApplyKernel::SharedStorage), stream
+    >>>(params_.softmax);
+
+    result = cudaGetLastError();
+
+    if (result != cudaSuccess) {
+      return cutlass::Status::kErrorInternal;
+    }
+
+    return cutlass::Status::kSuccess;
+  }
+
+  /// Function call operator
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/examples/36_gather_scatter_fusion/CMakeLists.txt b/examples/36_gather_scatter_fusion/CMakeLists.txt
new file mode 100644
index 00000000..28edd478
--- /dev/null
+++ b/examples/36_gather_scatter_fusion/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(
+  36_gather_scatter_fusion
+  gather_scatter_fusion.cu
+  )
diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
new file mode 100644
index 00000000..f43d1160
--- /dev/null
+++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
@@ -0,0 +1,537 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// This example fuses gather before GEMM and scatter after GEMM into the same
+// GEMM kernel.  Gather and scatter operation is controled by an index vector
+// to select rows or columns from A, B, C or D matrices.
+//
+// Suppose, all matrices are column major.  The pseudo code of the fused kernel
+// in this example is essentially
+//
+//    for (int i = 0; i < problem_size.m(); ++i) {
+//      for (int j = 0; j < options.index_size; ++j) {
+//        int b_c_d_col = tensor_indices.at({j, 0});
+//
+//        for (int k = 0; k < options.index_size; ++k) {
+//            int a_col = tensor_indices.at({k, 0});
+//            tensor_d_ref.at({i, b_c_d_col}) +=
+//              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col}) + beta * tensor_c.at({i, b_c_d_col});
+//        }
+//      }
+//
+// Note that the index vector contains unique random integers with max to be N - 1
+//
+// The gather/scatter operation works best when we can still keep the biggest
+// alignment. For example, when the matrix is row major, we select rows. When
+// the matrix is column major, we selct columns.
+//
+// Not all the combination of gather and scatter are legal. For example, if A is
+// row major and C/D is column major, we cannot gather A and scatter C/D at the
+// same time.
+//
+// Also, we don't check the index value is legal and index array point is valid
+// for the sake of the performance.
+ 
+#include <stdlib.h>
+#include <stdio.h>
+#include <time.h>
+#include <math.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <numeric>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/reference/device/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "helper.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Result structure
+struct Result {
+
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;
+
+  //
+  // Methods
+  //
+
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int index_size;
+
+  bool reference_check;
+  int iterations;
+  
+  Options():
+    help(false),
+    problem_size({248, 1024, 1024}),
+    index_size(240),
+    reference_check(true),
+    iterations(20) { }
+
+  bool valid() {
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+
+    cmd.get_cmd_line_argument("index_size", index_size);
+    
+    cmd.get_cmd_line_argument("iterations", iterations);
+
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "36_gather_scatter_fusion example\n\n"
+      << "  This example uses the CUTLASS Library to fuse gather/scatter into GEMM\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m=<int>                   GEMM M dimension\n"
+      << "  --n=<int>                   GEMM N dimension\n"
+      << "  --k=<int>                   GEMM K dimension\n"
+      << "  --index_size=<int>          size of N dimension index\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/36_gather_scatter_fusion/36_gather_scatter_fusion --m=1024 --n=512 --k=1024 \\\n"
+      << "     --index_size=128\n\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds 
+    int64_t fmas = problem_size.product();
+    
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// The code section below describes datatype for input, output matrices and computation between
+// elements in input matrices.
+using ElementAccumulator = float;                   // <- data type of accumulator
+using ElementComputeEpilogue = ElementAccumulator;  // <- data type of epilogue operations
+using ElementInputA = cutlass::half_t;;             // <- data type of elements in input matrix A
+using ElementInputB = cutlass::half_t;;             // <- data type of elements in input matrix B
+using ElementOutput = float;                        // <- data type of elements in output matrix D
+
+// The code section below describes matrix layout of input and output matrices.
+// Column Major for Matrix A, B and C.
+//
+using LayoutInputA = cutlass::layout::ColumnMajor;
+using LayoutInputB = cutlass::layout::ColumnMajor;
+using LayoutOutput = cutlass::layout::ColumnMajor;
+
+// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
+using MMAOp = cutlass::arch::OpClassTensorOp;
+
+// This code section describes CUDA SM architecture number
+using SmArch = cutlass::arch::Sm80;
+
+// This code section describes the tile size a thread block will compute
+using ShapeMMAThreadBlock =
+    cutlass::gemm::GemmShape<128, 128, 32>;  // <- threadblock tile M = 128, N = 128, K = 32
+// This code section describes tile size a warp will compute
+using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>;  // <- warp tile M = 64, N = 64, K = 32 
+// This code section describes the size of MMA op
+using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 16>;  // <- MMA Op tile M = 8, N = 8, K = 4
+// 16, 8, 8 -> Turing
+// 16, 8, 16 -> Ampere
+
+// This code section describes how threadblocks are scheduled on GPU
+using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;  // <- ??
+
+// Define the epilogue operation as LinearCombination. This is approximately equal to
+//
+//    d_ij = alpha * sum_k(a_ik * b_kj) + c_ij
+//
+using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+    ElementOutput,                                        // <- data type of output matrix
+    128 / cutlass::sizeof_bits<ElementOutput>::value,     // <- this is the number of elements per
+                                                          // vectorized memory access. For half
+                                                          // precision, it's 8 elements. This becomes
+                                                          // the vector width of math instructions in
+                                                          // epilogue too
+    ElementAccumulator,                                   // <- data type of accumulator
+    ElementComputeEpilogue,                               // <- data type for alpha in linear combination function
+    cutlass::epilogue::thread::ScaleType::Nothing>;       // <- C
+
+// Number of pipelines you want to use
+constexpr int NumStages = 5;
+// Ampere -> 4/5
+// Turing -> 2
+
+using Gemm = cutlass::gemm::device::GemmUniversal<ElementInputA,
+                                                  LayoutInputA,
+                                                  ElementInputB,
+                                                  LayoutInputB,
+                                                  ElementOutput,
+                                                  LayoutOutput,
+                                                  ElementAccumulator,
+                                                  MMAOp,
+                                                  SmArch,
+                                                  ShapeMMAThreadBlock,
+                                                  ShapeMMAWarp,
+                                                  ShapeMMAOp,
+                                                  EpilogueOp,
+                                                  SwizzleThreadBlock,
+                                                  NumStages,
+                                                  8,     /*alignmentA*/
+                                                  8,     /*alignmengB*/
+                                                  cutlass::arch::OpMultiplyAdd,
+                                                  cutlass::ComplexTransform::kNone,
+                                                  cutlass::ComplexTransform::kNone,
+                                                  true,  /*GatherA*/
+                                                  true,  /*GatherB*/
+                                                  true   /*ScatterD*/
+                                                 >;
+
+int run(Options &options) {
+
+  // ================================================================================
+  // Initialization setup
+
+  // Create a tuple of problem size for matrix multiplication
+  cutlass::gemm::GemmCoord problem_size = options.problem_size;
+
+  // Create a tuple of problem size for matrix multiplication
+  cutlass::gemm::GemmCoord problem_size_real(problem_size.m(),
+                                             options.index_size,
+                                             options.index_size);
+
+  // Initialize tensors using CUTLASS helper functions
+  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
+      problem_size.mk());  // <- Create matrix A with dimensions M x K
+  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
+      cutlass::make_Coord(options.index_size, problem_size.n()));  // <- Create matrix B with dimensions K x N
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
+      problem_size.mn());  // <- Create matrix C with dimensions M x N 
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d_scattered(
+      problem_size.mn());  // <- Create matrix D with dimensions M x N used to store output from
+                           // CUTLASS kernel
+
+  // Fill input and output matrices on host using CUTLASS helper functions
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_a.host_view(),
+      1,
+      ElementInputA(7),
+      ElementInputA(-8),
+      0);  // <- Fill matrix A on host with uniform-distribution random data
+
+  cutlass::reference::host::TensorFillRandomUniform(
+      tensor_b.host_view(),
+      1,
+      ElementInputA(7),
+      ElementInputA(-8),
+      0);  // <- Fill matrix B on host with uniform-distribution random data
+
+  cutlass::reference::host::TensorFill(
+      tensor_c.host_view());  // <- Fill matrix C on host with zeros
+
+  cutlass::reference::host::TensorFill(
+    tensor_d_scattered.host_view());  // <- fill matrix D on host with zeros
+
+  cutlass::HostTensor<int, LayoutOutput> tensor_indices(
+      {options.index_size, 1});  // <- Create scatter indices with dimensions val_len x 1
+
+  // <- Fill tensor_b_indices on host with unique random integers
+  std::vector<int> to_fill(problem_size.n()) ; // vector with ints.
+  std::iota (std::begin(to_fill), std::end(to_fill), 0); // Fill with 0, 1, ...., problem_size.n()
+  std::random_shuffle(to_fill.begin(), to_fill.end());
+  memcpy(tensor_indices.host_data(), to_fill.data(), options.index_size * sizeof(int));
+
+  // Copy data from host to GPU
+  tensor_a.sync_device();
+  tensor_b.sync_device();
+  tensor_indices.sync_device();
+  tensor_c.sync_device();
+  tensor_d_scattered.sync_device();
+
+  // Initialize alpha/beta for dot product computation
+  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
+  ElementComputeEpilogue beta = ElementComputeEpilogue(1);
+
+  // Split K dimension into 1 partitions
+  int split_k_slices = 1;
+
+  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
+  // instantiated CUTLASS kernel
+  typename Gemm::Arguments arguments{
+      cutlass::gemm::GemmUniversalMode::kGemm, 
+      problem_size_real,                  // <- problem size of matrix multiplication
+      split_k_slices,                     // <- k-dimension split factor
+      {alpha, beta},                      // <- alpha, beta
+      tensor_a.device_data(),             // <- reference to matrix A on device
+      tensor_b.device_data(),             // <- reference to matrix B on device
+      tensor_c.device_data(),             // <- reference to matrix C on device
+      tensor_d_scattered.device_data(),   // <- reference to matrix D on device
+      tensor_a.layout().capacity(problem_size.mk()),
+      tensor_b.layout().capacity(cutlass::make_Coord(options.index_size, problem_size.n())),
+      tensor_c.layout().capacity(problem_size.mn()),
+      tensor_d_scattered.layout().capacity(problem_size.mn()),
+      tensor_a.layout().stride(),
+      tensor_b.layout().stride(),
+      tensor_c.layout().stride(),
+      tensor_d_scattered.layout().stride(),
+      tensor_indices.device_data(),       // <- pointer to index vector to gather A on device
+      tensor_indices.device_data(),       // <- pointer to index vector to gather B on device
+      tensor_indices.device_data()};      // <- pointer to index vector to scatter D on device
+
+  // Using the arguments, query for extra workspace required for matrix multiplication computation
+  size_t workspace_size = Gemm::get_workspace_size(arguments);
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+  // Instantiate CUTLASS kernel depending on templates
+  Gemm gemm_op;
+
+  // Check the problem size is supported or not 
+  cutlass::Status status = gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
+  // Initialize CUTLASS kernel with arguments and workspace pointer
+  status = gemm_op.initialize(arguments, workspace.get());
+  CUTLASS_CHECK(status);
+
+  // CPU reference calculation
+  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d_ref(problem_size.mn());
+  cutlass::reference::host::TensorFill(
+    tensor_d_ref.host_view());  // <- Fill matrix D on host with zeros
+
+  status = gemm_op();
+  cudaDeviceSynchronize();
+  CUTLASS_CHECK(status);
+
+  if (options.reference_check) {
+    for (int i = 0; i < problem_size.m(); ++i) {
+      for (int j = 0; j < options.index_size; ++j) {
+        int b_c_d_col = tensor_indices.at({j, 0});
+
+        for (int k = 0; k < options.index_size; ++k) {
+            int a_col = tensor_indices.at({k, 0});
+            tensor_d_ref.at({i, b_c_d_col}) +=
+              alpha * tensor_a.at({i, a_col}) * tensor_b.at({k, b_c_d_col}) + beta * tensor_c.at({i, b_c_d_col});
+        }
+      }
+    }
+
+    // Copy output data from CUTLASS and reference kernel to host for comparison
+    tensor_d_scattered.sync_host();
+  
+    bool passed = cutlass::reference::host::TensorEquals(
+                    tensor_d_scattered.host_view(),
+                    tensor_d_ref.host_view());
+
+    if (!passed) {
+      std::cout << "Failed!\n";
+
+      std::stringstream fname;
+      fname << "error_gather_GEMM_scatter_fusion.txt";
+      std::cerr << "Dumping results in " << fname.str() << "\n";
+
+      std::ofstream file(fname.str());
+
+      file 
+        << "A =\n" << tensor_a.host_view()
+        << "\nB =\n" << tensor_b.host_view()
+        << "\nindices =\n" << tensor_indices.host_view()
+        << "\nC =\n" << tensor_c.host_view()
+        << "\n\nReference =\n" << tensor_d_ref.host_view()
+        << "\nComputed =\n" << tensor_d_scattered.host_view();
+      return -1;
+    } else {
+      std::cout << "Passed!\n";
+    }
+  }
+
+  // Result structure
+  Result result;
+
+  //
+  // Construct events
+  //
+
+  cudaEvent_t events[2];
+
+  for (auto & event : events) {
+    result.error = cudaEventCreate(&event);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return -1;
+    }
+  }
+
+  // Record an event at the start of a series of GEMMs
+  result.error = cudaEventRecord(events[0]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  //
+  // Run profiling loop
+  //
+
+  for (int iter = 0; iter < options.iterations; ++iter) {
+    // Launch initialized CUTLASS kernel
+    status = gemm_op();
+    CUTLASS_CHECK(status);
+  }
+
+  //
+  // Stop profiling loop
+  //
+
+  // Record an event when the GEMMs are complete
+  result.error = cudaEventRecord(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Wait for work on the device to complete.
+  result.error = cudaEventSynchronize(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Measure elapsed runtime
+  float runtime_ms = 0;
+  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Compute average runtime and GFLOPs.
+  result.runtime_ms = double(runtime_ms) / double(options.iterations);
+  result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+  // Cleanup
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+
+  std::cout << "Runtime: " << result.runtime_ms << " ms\n";
+  std::cout << " GFLOPs: " << result.gflops << "\n";
+
+  return 0;
+}
+
+int main(int argc, const char ** argv) {
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+  //
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
+
+  if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
+    std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
+              << std::endl;
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    return 0;
+  }
+
+  Options options;
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << "\n";
+    return 0;
+  }
+
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << "\n";
+    return -1;
+  }
+
+  return run(options);
+}
diff --git a/examples/40_cutlass_py/test-cutlass-py.py b/examples/40_cutlass_py/test-cutlass-py.py
new file mode 100644
index 00000000..34006dcc
--- /dev/null
+++ b/examples/40_cutlass_py/test-cutlass-py.py
@@ -0,0 +1,170 @@
+
+# System modules
+import numpy as np
+import os.path
+import sys
+import ctypes
+
+# CUDA Python modules
+from cuda import cuda
+from cuda import nvrtc
+
+# CUTLASS modules
+import library
+import manifest as cutlass_manifest
+import generator
+import rt
+
+
+#
+# Construct an SGEMM
+#
+
+manifest = cutlass_manifest.Manifest()
+
+generator.GenerateSM50_Simt(manifest, "11.5.0")
+
+#
+# Construct a GEMM operation
+#
+
+operation = manifest.operations_by_name['cutlass_simt_sgemm_128x128_8x2_nt_align1']
+
+#
+# Construct a runtime GEMM operation
+#
+gemm = rt.Gemm(operation)
+
+#
+# Initialize context
+#
+err, = cuda.cuInit(0)
+
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, device = cuda.cuDeviceGet(0)
+
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, context = cuda.cuCtxCreate(0, device)
+
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+#
+# Construct a module
+#
+
+architectures = [80,]
+include_paths = [
+  '/home/akerr/repositories/git/dlarch-fastkernels/cutlass-2.x/include',
+  '/home/akerr/repositories/git/dlarch-fastkernels/cutlass-2.x/tools/util/include',
+  '/home/akerr/sdk/cuda/11.5-tot-2021-10-28/include', 
+]
+
+compilation_options = rt.CompilationOptions(architectures, include_paths)
+
+module = rt.Module('module.cu', [gemm], compilation_options)
+
+#
+# Setup a workspace
+#
+
+M, N, K = (128, 128, 128)
+
+tensor_A = np.ndarray(M * K, dtype=np.float32)
+tensor_B = np.ndarray(N * K, dtype=np.float32)
+tensor_C = np.ndarray(M * N, dtype=np.float32)
+tensor_D = np.ndarray(M * N, dtype=np.float32)
+
+err, tensor_A_d = cuda.cuMemAlloc(tensor_A.size * tensor_A.itemsize)
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, tensor_B_d = cuda.cuMemAlloc(tensor_B.size * tensor_B.itemsize)
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, tensor_C_d = cuda.cuMemAlloc(tensor_C.size * tensor_C.itemsize)
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, tensor_D_d = cuda.cuMemAlloc(tensor_D.size * tensor_D.itemsize)
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+err, stream = cuda.cuStreamCreate(0)
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+tensors = [
+  (tensor_A_d, tensor_A),
+  (tensor_B_d, tensor_B),
+  (tensor_C_d, tensor_C),
+  (tensor_D_d, tensor_D)
+]
+
+for tensor_device, tensor_host in tensors:
+  bytes = tensor_host.size * tensor_host.itemsize
+  print("Tensor has dimensions: %s (%d bytes)" % (str(tensor_host.size), tensor_host.itemsize))
+  err, = cuda.cuMemcpyHtoDAsync(tensor_device, tensor_host, bytes, stream)
+  print("updating tensor in device memory ", hex(int(tensor_device)))
+  if err != cuda.CUresult.CUDA_SUCCESS:
+    raise RuntimeError('CUDA Error %s' % str(err))
+
+#
+# Initialize a host buffer
+#
+
+arguments = rt.GemmArguments()
+
+arguments.problem_size = rt.GemmCoord(M, N, K)
+
+arguments.A = rt.TensorRef(tensor_A_d, M)
+arguments.B = rt.TensorRef(tensor_B_d, N)
+arguments.C = rt.TensorRef(tensor_C_d, M)
+arguments.D = rt.TensorRef(tensor_D_d, M)
+
+host_workspace = bytearray(gemm.get_host_workspace_size(arguments))
+device_workspace = None
+
+launch_config = gemm.plan(arguments)
+
+byte_count = gemm.initialize(host_workspace, device_workspace, launch_config, arguments)
+
+#
+# Launch the kernel
+#
+
+err = gemm.run(host_workspace, device_workspace, launch_config)
+
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError('CUDA Error %s' % str(err))
+
+#
+# Verify results
+#
+err, = cuda.cuStreamSynchronize(stream)
+
+if err != cuda.CUresult.CUDA_SUCCESS:
+  raise RuntimeError("CUDA Error %s" % str(err))
+
+
+#
+# Debug reporting of byte array contents
+#
+
+def PrintBytearray(host_workspace):
+  uint_str = None
+  prefix = None
+  print("uint32_t host_workspace[] = {")
+  for idx, byte in enumerate(host_workspace):
+    if not (idx % 4):
+      if uint_str is not None:
+        print(prefix, uint_str, ",")
+      prefix = "/* offset: %d B */    0x" % idx
+      uint_str = ""
+    uint_str = "{:02x}".format(byte) + uint_str
+  print("};")
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index eacfc245..d5fdac14 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,25 +1,33 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+
 set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common)
 
 add_custom_target(cutlass_examples)
@@ -102,7 +110,12 @@ foreach(EXAMPLE
   28_ampere_3xtf32_fast_accurate_tensorop_fprop
   29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm
   30_wgrad_split_k
+  31_basic_syrk
+  32_basic_trmm
+  33_ampere_3xtf32_tensorop_symm
   34_transposed_conv2d
+  35_gemm_softmax
+  36_gather_scatter_fusion
   )
 
   add_subdirectory(${EXAMPLE})
diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h
index a5742a68..f869d388 100644
--- a/include/cutlass/aligned_buffer.h
+++ b/include/cutlass/aligned_buffer.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h
index 241ef568..578e6c14 100644
--- a/include/cutlass/arch/arch.h
+++ b/include/cutlass/arch/arch.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h
index 75781c64..d84d4790 100644
--- a/include/cutlass/arch/cache_operation.h
+++ b/include/cutlass/arch/cache_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h
index 92d536f6..41a785bf 100644
--- a/include/cutlass/arch/memory.h
+++ b/include/cutlass/arch/memory.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -248,7 +254,7 @@ struct global_store<AccessType, 64> {
         "l"(((uint8_t *)ptr) + 32),
         "r"(data[2].x), "r"(data[2].y), "r"(data[2].z), "r"(data[2].w),
         "l"(((uint8_t *)ptr) + 48),
-        "r"(data[3].x), "r"(data[3].y), "r"(data[3].z), "r"(data[2].w));
+        "r"(data[3].x), "r"(data[3].y), "r"(data[3].z), "r"(data[3].w));
   }
 };
 
@@ -345,6 +351,114 @@ struct global_store<AccessType, 1> {
   }
 };
 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// ld.shared
+template <int Bytes>
+void shared_load(void *dst, uint32_t ptr);
+
+/// ld.shared - 16b
+template <>
+CUTLASS_DEVICE
+void shared_load<2>(void *dst, uint32_t ptr) {
+  asm volatile("ld.shared.u16 %0, [%1];\n"
+    : "=h"(*reinterpret_cast<uint16_t *>(dst))
+    : "r"(ptr));
+}
+
+/// ld.shared - 32b
+template <>
+CUTLASS_DEVICE
+void shared_load<4>(void *dst, uint32_t ptr) {
+  asm volatile("ld.shared.u32 %0, [%1];\n"
+    : "=r"(*reinterpret_cast<uint32_t *>(dst))
+    : "r"(ptr));
+}
+
+/// ld.shared - 64b
+template <>
+CUTLASS_DEVICE
+void shared_load<8>(void *dst, uint32_t ptr) {
+  uint2 *dst_u64 = reinterpret_cast<uint2 *>(dst);
+  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n"
+    :
+      "=r"(dst_u64->x),
+      "=r"(dst_u64->y)
+    : "r"(ptr));
+}
+
+/// ld.shared - 128b
+template <>
+CUTLASS_DEVICE
+void shared_load<16>(void *dst, uint32_t ptr) {
+  uint4 *dst_u128 = reinterpret_cast<uint4 *>(dst);
+  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
+    :
+      "=r"(dst_u128->x),
+      "=r"(dst_u128->y),
+      "=r"(dst_u128->z),
+      "=r"(dst_u128->w)
+    : "r"(ptr));
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// st.shared
+template <int Bytes>
+void shared_store(uint32_t ptr, void const *src);
+
+/// st.shared - 16b
+template <>
+CUTLASS_DEVICE
+void shared_store<2>(uint32_t ptr, void const *src) {
+  asm volatile("st.shared.u16 [%0], %1;\n"
+    : :
+    "r"(ptr)
+    "h"(*reinterpret_cast<uint16_t const *>(src))
+  );
+}
+
+/// st.shared - 32b
+template <>
+CUTLASS_DEVICE
+void shared_store<4>(uint32_t ptr, void const *src) {
+  asm volatile("st.shared.u32 [%0], %1;\n"
+    : :
+    "r"(ptr)
+    "r"(*reinterpret_cast<uint32_t const  *>(src))
+  );
+}
+
+/// st.shared - 64b
+template <>
+CUTLASS_DEVICE
+void shared_store<8>(uint32_t ptr, void const *src) {
+  uint2 const *dst_u64 = reinterpret_cast<uint2 const *>(src);
+  asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+    : :
+      "r"(ptr)
+      "r"(dst_u64->x),
+      "r"(dst_u64->y)
+    );
+}
+
+/// st.shared - 128b
+template <>
+CUTLASS_DEVICE
+void shared_store<16>(uint32_t ptr, void const *src) {
+  uint4 const *dst_u128 = reinterpret_cast<uint4 const *>(src);
+  asm volatile("ld.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
+    : :
+      "r"(ptr)
+      "r"(dst_u128->x),
+      "r"(dst_u128->y),
+      "r"(dst_u128->z),
+      "r"(dst_u128->w)
+    );
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace arch
@@ -352,7 +466,7 @@ struct global_store<AccessType, 1> {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-#include "memory_sm75.h"  
+#include "memory_sm75.h"
 #include "memory_sm80.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h
index 96b208f3..d97682ed 100644
--- a/include/cutlass/arch/memory_sm75.h
+++ b/include/cutlass/arch/memory_sm75.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h
index 01820adf..48a499c5 100644
--- a/include/cutlass/arch/memory_sm80.h
+++ b/include/cutlass/arch/memory_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -80,6 +86,18 @@ template <
     CacheOperation::Kind cache_op = CacheOperation::Always>
 struct cp_async_nan;
 
+/// Either 0 or 1 are written to SMEM based on input element type
+/// Used for diagonal elements of triangular matrix of BLAS3 functions
+///
+/// STS
+///
+template <
+   /// Type of Element
+   typename Element,
+   /// If the data is for a Hermitian matrix diagonal
+   bool IsHermitianData = false>
+struct cp_async_diag;
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Partial specialization
@@ -204,6 +222,90 @@ struct cp_async_nan<16, CacheOperation::Always> {
   }
 };
 
+/// Partial specialization to write one (1)
+template<typename Element_>
+struct cp_async_diag <Element_, false> {
+  using Element = Element_;
+
+  CUTLASS_DEVICE
+  cp_async_diag(void *smem_ptr) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      /// Values for the diagonal elements of the triangular input matrix
+      static __constant__ uint2 DIAG_DATA_DOUBLE_ONE = {0x3ff00000, 0x00000000};
+      static __constant__ uint1 DIAG_DATA_FLOAT_ONE = {0x3f800000};
+      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+
+      if (platform::is_same<Element, complex<double>>::value) {
+        asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y), "r"(DIAG_DATA_DOUBLE_ONE.x),
+                      "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, complex<float>>::value) {
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, double>::value) {
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_DOUBLE_ONE.y),"r"(DIAG_DATA_DOUBLE_ONE.x));
+      } else if (platform::is_same<Element, float>::value) {
+        asm volatile("st.shared.u32 [%0], %1;\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_FLOAT_ONE.x));
+      } else {
+        CUTLASS_UNUSED(smem_int_ptr);
+        CUTLASS_NOT_IMPLEMENTED();
+      }
+      
+    #else
+
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+
+/// Partial specialization to write zero for the imaginary part of Hermitian data
+template<typename Element_>
+struct cp_async_diag <Element_, true> {
+  using Element = Element_;
+
+  CUTLASS_DEVICE
+  cp_async_diag(void *smem_ptr) {
+    #if CUDA_CP_ASYNC_ACTIVATED
+
+      /// Values for the diagonal elements of the triangular input matrix
+      static __constant__ uint1 DIAG_DATA_ZERO = {0x00000000};
+
+      unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr);
+
+      if (platform::is_same<Element, complex<double>>::value) {
+        asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x), "r"(DIAG_DATA_ZERO.x));
+      } else if (platform::is_same<Element, complex<float>>::value) {
+        asm volatile("st.shared.u32 [%0], %1;\n"
+                      : :
+                      "r"(smem_int_ptr), "r"(DIAG_DATA_ZERO.x));
+      } else {
+        CUTLASS_UNUSED(smem_int_ptr);
+        CUTLASS_NOT_IMPLEMENTED();
+      }
+      
+    #else
+
+      CUTLASS_UNUSED(smem_ptr);
+      CUTLASS_NOT_IMPLEMENTED();
+
+    #endif
+  }
+};
+
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Partial specialization
diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h
index eefa3596..e79a4099 100644
--- a/include/cutlass/arch/mma.h
+++ b/include/cutlass/arch/mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h
index b6d60cf7..96977c41 100644
--- a/include/cutlass/arch/mma_sm50.h
+++ b/include/cutlass/arch/mma_sm50.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h
index 8cdba938..2d8f9ab9 100644
--- a/include/cutlass/arch/mma_sm60.h
+++ b/include/cutlass/arch/mma_sm60.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h
index 860ac41d..274c0acb 100644
--- a/include/cutlass/arch/mma_sm61.h
+++ b/include/cutlass/arch/mma_sm61.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h
index 2689595b..4fe862f4 100644
--- a/include/cutlass/arch/mma_sm70.h
+++ b/include/cutlass/arch/mma_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -112,6 +118,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -168,6 +177,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -224,6 +236,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -280,6 +295,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -362,6 +380,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -438,6 +459,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -514,6 +538,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
@@ -590,6 +617,9 @@ struct Mma<
 
 #else
     assert(0);
+    #if defined(__CUDA_ARCH__)
+    asm volatile ("brkpt;\n" ::);
+    #endif
 #endif
   }
 };
diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h
index d2a39574..816072bd 100644
--- a/include/cutlass/arch/mma_sm75.h
+++ b/include/cutlass/arch/mma_sm75.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h
index a50ec049..5b9f5240 100644
--- a/include/cutlass/arch/mma_sm80.h
+++ b/include/cutlass/arch/mma_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/mma_sparse_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h
index 0c358f5f..e22d6006 100644
--- a/include/cutlass/arch/mma_sparse_sm80.h
+++ b/include/cutlass/arch/mma_sparse_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h
index f9fe9969..abbabf94 100644
--- a/include/cutlass/arch/simd.h
+++ b/include/cutlass/arch/simd.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h
index c27fb288..1de8d42b 100644
--- a/include/cutlass/arch/simd_sm60.h
+++ b/include/cutlass/arch/simd_sm60.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h
index f0f40419..da236c16 100644
--- a/include/cutlass/arch/simd_sm61.h
+++ b/include/cutlass/arch/simd_sm61.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h
index 6a90263a..13d0a9ed 100644
--- a/include/cutlass/arch/wmma.h
+++ b/include/cutlass/arch/wmma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h
index 3a234199..59d8eeed 100644
--- a/include/cutlass/arch/wmma_sm70.h
+++ b/include/cutlass/arch/wmma_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h
index 2e321f0c..0895c86b 100644
--- a/include/cutlass/arch/wmma_sm72.h
+++ b/include/cutlass/arch/wmma_sm72.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h
index 81febe21..c2c9e068 100644
--- a/include/cutlass/arch/wmma_sm75.h
+++ b/include/cutlass/arch/wmma_sm75.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/array.h b/include/cutlass/array.h
index ecca8913..347002f0 100644
--- a/include/cutlass/array.h
+++ b/include/cutlass/array.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -57,7 +63,7 @@ struct sizeof_bits<Array<T, N, RegisterSized> > {
 /// Returns true if the argument is a power of 2
 CUTLASS_HOST_DEVICE
 constexpr bool ispow2(unsigned x) {
-  return !(x & (x - 1));
+  return x && (!(x & (x - 1)));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -65,7 +71,7 @@ constexpr bool ispow2(unsigned x) {
 /// Returns the largest power of two not greater than the argument.
 CUTLASS_HOST_DEVICE
 constexpr unsigned floor_pow_2(unsigned x) {
-  return ispow2(x) ? x : floor_pow_2(x >> 1);
+  return (x == 0 || ispow2(x)) ? x : ((floor_pow_2(x >> 1)) << 1);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/array_planar_complex.h b/include/cutlass/array_planar_complex.h
index c030dc3a..0d8d2c98 100644
--- a/include/cutlass/array_planar_complex.h
+++ b/include/cutlass/array_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h
index a0c59526..12b1665f 100644
--- a/include/cutlass/array_subbyte.h
+++ b/include/cutlass/array_subbyte.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -496,7 +502,17 @@ class Array<T, N, false> {
 
   CUTLASS_HOST_DEVICE
   void fill(T const &value) {
-    // TODO
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerStoredItem; ++i) {
+      reference ref(storage, i);
+      ref = value;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 1; i < kStorageElements; ++i) {
+      storage[i] = storage[0];
+    }
   }
 
   CUTLASS_HOST_DEVICE
diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h
index 3c97bb17..e4d20efc 100644
--- a/include/cutlass/bfloat16.h
+++ b/include/cutlass/bfloat16.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/blas3.h b/include/cutlass/blas3.h
new file mode 100644
index 00000000..3c2df6dd
--- /dev/null
+++ b/include/cutlass/blas3.h
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+  \brief Basic include for CUTLASS BLAS3/HPC code.
+    
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_types.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Enumerated type describing the type of kernel (based on input or output matrices).
+enum class BlasMode {
+  kGemm,
+  kSymmetric,
+  kHermitian,
+  kTriangular,
+  kInvalid
+};
+
+/// Enumerated type describing the fill mode for matrices for BLAS functions.
+enum class FillMode {
+  kFull,              /// The entire tensor is covered.
+  kLower,             /// The 'lower' part of a tensor is covered including diagonal
+  kUpper,             /// The 'upper' part of a tensor is covered including diaognal
+  kDiagonal,          /// Only diagonal elements are covered.
+  kNone,              /// No element is covered.
+  kInvalid
+};
+
+/// Enumerated type describing the diagonal property of matrices for BLAS functions.
+enum class DiagType {
+  kNonUnit,
+  kUnit,
+  kZero, // Only used internally for computing SYMM/HEMM
+  kInvalid
+}; 
+
+/// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions.
+enum class SideMode {
+  kLeft,
+  kRight,
+  kInvalid
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines FillMode inversions
+template <FillMode kFillMode>
+struct InvertFillMode;
+
+/// Invert FillMode lower to upper
+template <>
+struct InvertFillMode<FillMode::kLower> {
+  static FillMode const mode = FillMode::kUpper;
+};
+
+/// Invert FillMode upper to lower
+template <>
+struct InvertFillMode<FillMode::kUpper> {
+  static FillMode const mode = FillMode::kLower;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines SideMode inversions
+template <SideMode kSideMode>
+struct InvertSideMode;
+
+/// Invert SideMode left to right
+template <>
+struct InvertSideMode<SideMode::kLeft> {
+  static SideMode const mode = SideMode::kRight;
+};
+
+/// Invert SideMode right to left
+template <>
+struct InvertSideMode<SideMode::kRight> {
+  static SideMode const mode = SideMode::kLeft;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Defines correct compare operation for Triangular matrix boundary
+template <FillMode kFillMode, DiagType kDiagType = DiagType::kNonUnit>
+struct TrMatrixCompareOp {
+  using Index = int32_t;
+  using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower), 
+                        greater_equal<Index>, 
+                        less_equal<Index>>::type;
+};
+
+template <FillMode kFillMode>
+struct TrMatrixCompareOp <kFillMode, DiagType::kUnit> {
+   using Index = int32_t;
+   using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower), 
+                        greater_equal<Index>, 
+                        less_equal<Index>>::type;
+};
+
+template <FillMode kFillMode>
+struct TrMatrixCompareOp <kFillMode, DiagType::kZero> {
+   using Index = int32_t;
+   using Type = typename platform::conditional<
+                        (kFillMode == FillMode::kLower), 
+                        greater<Index>, 
+                        less<Index>>::type;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Returns precision in terms of bits (based on datatype) to fill tensors with.
+// Defaults to 5 bits of mantissa for TF32 and FP32 (with implicit round-offs).
+// Also defines acceptable mantissa result variance/error.
+template <typename Element>
+struct MantissaInBits {
+  static int constexpr bits = 5;
+  static double constexpr error = 1.0e-7;
+};
+
+// Full precision is supported for FP64
+template <>
+struct MantissaInBits<double> {
+  static int constexpr bits = 30;
+  static double constexpr error = 1.0e-15;
+};
+
+template <>
+struct MantissaInBits<cutlass::complex<double>> {
+  static int constexpr bits = 30;
+  static double constexpr error = 1.0e-15;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h
index 8ce5f9a3..8be2c494 100644
--- a/include/cutlass/complex.h
+++ b/include/cutlass/complex.h
@@ -1,34 +1,40 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 #pragma once
 
-#include <cuComplex.h>
 #if defined(__CUDACC_RTC__)
 #include <cuda/std/cstdint>
 #else
 #include <cstdint>
+#include <cuComplex.h>
 #endif
 
 #include "cutlass/cutlass.h"
@@ -77,6 +83,7 @@ struct InvertComplexTransform<ComplexTransform::kConjugate> {
 // Accessors for CUDA complex types
 //
 
+#if !defined(__CUDACC_RTC__)
 /// Returns the real part of the complex number
 CUTLASS_HOST_DEVICE
 float const &real(cuFloatComplex const &z) { return z.x; }
@@ -108,6 +115,7 @@ double const &imag(cuDoubleComplex const &z) { return z.y; }
 /// Returns the imaginary part of the complex number
 CUTLASS_HOST_DEVICE
 double &imag(cuDoubleComplex &z) { return z.y; }
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -150,6 +158,8 @@ class complex
   CUTLASS_HOST_DEVICE
   complex(complex<A> const &z) : _real(static_cast<T>(z.real())), _imag(static_cast<T>(z.imag())) {}
 
+
+  #if !defined(__CUDACC_RTC__)
   /// Conversion from cuFloatComplex
   CUTLASS_HOST_DEVICE
   complex(cuFloatComplex const &z) : _real(static_cast<T>(cuCrealf(z))), _imag(static_cast<T>(cuCimagf(z))) {}
@@ -157,6 +167,7 @@ class complex
   /// Conversion from cuDoubleComplex
   CUTLASS_HOST_DEVICE
   complex(cuDoubleComplex const &z) : _real(static_cast<T>(cuCreal(z))), _imag(static_cast<T>(cuCimag(z))) {}
+  #endif
 
   /// Assignment
   template<typename A>
@@ -271,6 +282,8 @@ class complex
   CUTLASS_HOST_DEVICE
   T &imag() { return _imag; }
 
+
+  #if !defined(__CUDACC_RTC__)
   /// Converts to cuFloatComplex
   CUTLASS_HOST_DEVICE
   explicit operator cuFloatComplex() const { return make_cuFloatComplex(float(real()), float(imag())); }
@@ -278,6 +291,7 @@ class complex
   /// Converts to cuDoubleComplex
   CUTLASS_HOST_DEVICE
   explicit operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); }
+  #endif
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -479,6 +493,13 @@ CUTLASS_HOST_DEVICE complex<T> sin(complex<T> const &z) {
   return (exp(-z) - exp(z)) * complex<T>(T(0), T(1) / T(2));
 }
 
+/// Comparison 
+template <typename T>
+CUTLASS_HOST_DEVICE bool operator<(complex<T> const &lhs, complex<T> const &rhs) {
+  //TODO
+  return true; 
+}
+
 //////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Partial specialization for complex-valued type.
diff --git a/include/cutlass/constants.h b/include/cutlass/constants.h
index 1ef271f7..abb7cab4 100644
--- a/include/cutlass/constants.h
+++ b/include/cutlass/constants.h
@@ -1,25 +1,31 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.                               
- *                                                                                                  
- * Redistribution and use in source and binary forms, with or without modification, are permitted   
- * provided that the following conditions are met:                                                  
- *     * Redistributions of source code must retain the above copyright notice, this list of        
- *       conditions and the following disclaimer.                                                   
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of     
- *       conditions and the following disclaimer in the documentation and/or other materials        
- *       provided with the distribution.                                                            
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used   
- *       to endorse or promote products derived from this software without specific prior written   
- *       permission.                                                                                
- *                                                                                                  
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR   
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE  
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,   
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;  
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,    
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE   
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                             
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *                                                                                                  
  **************************************************************************************************/
 
diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h
index 90c59ca9..7a1bc92c 100644
--- a/include/cutlass/conv/conv2d_problem_size.h
+++ b/include/cutlass/conv/conv2d_problem_size.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -41,6 +47,13 @@
 
 #pragma once
 
+
+#if defined(__CUDACC_RTC__)
+#include <cuda/std/cmath>
+#else
+#include <cmath>
+#endif
+
 #include "cutlass/cutlass.h"
 #include "cutlass/tensor_coord.h"
 #include "cutlass/fast_math.h"
@@ -348,29 +361,56 @@ CUTLASS_HOST_DEVICE
 int implicit_gemm_k_iterations(
   Operator conv_operator, 
   int threadblock_K, 
-  Conv2dProblemSize const &problem_size) {
+  Conv2dProblemSize const &problem_size,
+  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) {
 
   int iterations = 0;
-  int elements_per_split_k_slice = 0;
 
-  switch (conv_operator) {
-  case Operator::kFprop:
-    elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-    iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-    break;
+  if (algorithm == IteratorAlgorithm::kFixedChannels) {
 
-  case Operator::kDgrad:
-    elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-    iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
-    break;
+    int positions_per_iteration = threadblock_K / problem_size.C;
+    switch (conv_operator) {
+    case Operator::kFprop:
+      iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / positions_per_iteration;
+      break;
 
-  case Operator::kWgrad:
-    elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
-    iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
-    break;
+    default:
+      break;
+    }
+  }
+  else if (algorithm == IteratorAlgorithm::kFewChannels) {
 
-  default:
-    break;
+    switch (conv_operator) {
+    case Operator::kFprop:
+      iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K;
+      break;
+
+    default:
+      break;
+    }
+  }
+  else {
+    int elements_per_split_k_slice = 0;
+
+    switch (conv_operator) {
+    case Operator::kFprop:
+      elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+      iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+      break;
+
+    case Operator::kDgrad:
+      elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+      iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K);
+      break;
+
+    case Operator::kWgrad:
+      elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices;
+      iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K;
+      break;
+
+    default:
+      break;
+    }
   }
 
   return iterations;
@@ -499,12 +539,12 @@ void strided_dgrad_starting_coords(
 
   // start_h  = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h;
   stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h);
-  int r_ = std::abs(problem_size.stride_h - (pad_h_rem_ - r));
+  int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r));
   stride_h_divmod.divmod(start_h, r_);
 
   //start_w  = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w;
   stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w);
-  int s_ = std::abs(problem_size.stride_w - (pad_w_rem_ - s));
+  int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s));
   stride_w_divmod.divmod(start_w, s_);
 }
 
diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h
index 5ee3bb75..82ea1cef 100644
--- a/include/cutlass/conv/conv3d_problem_size.h
+++ b/include/cutlass/conv/conv3d_problem_size.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -332,7 +338,8 @@ CUTLASS_HOST_DEVICE
 int implicit_gemm_k_iterations(
   Operator conv_operator, 
   int threadblock_K, 
-  Conv3dProblemSize const &problem_size) {
+  Conv3dProblemSize const &problem_size,
+  IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) {
 
   int iterations = 0;
   int elements_per_split_k_slice = 0;
diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h
index fccb807b..52a4636c 100644
--- a/include/cutlass/conv/convolution.h
+++ b/include/cutlass/conv/convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -92,7 +98,9 @@ enum class Mode {
 /// Selects among several implementation variants trading off performance with simplicity
 enum class IteratorAlgorithm { 
   kAnalytic,      ///< functionally correct in all cases but lower performance
-  kOptimized      ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
+  kOptimized,     ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
+  kFixedChannels, ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
+  kFewChannels    ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
 };
 
 /// Distinguishes among partial specializations that accelerate certain problems where convolution
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h
index da4c8029..8e87ec56 100644
--- a/include/cutlass/conv/device/implicit_gemm_convolution.h
+++ b/include/cutlass/conv/device/implicit_gemm_convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
index 1b481055..b0c590ad 100644
--- a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
+++ b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h
index 5a83586e..00cff237 100644
--- a/include/cutlass/conv/kernel/default_conv2d.h
+++ b/include/cutlass/conv/kernel/default_conv2d.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -37,10 +43,8 @@
 #include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
 #include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
 #include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
-
 #include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
 #include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
-
 #include "cutlass/conv/convolution.h"
 #include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
 #include "cutlass/conv/threadblock/implicit_gemm_pipelined.h"
@@ -101,7 +105,6 @@ struct DefaultConvEpilogue<
   >::Epilogue;
 };
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <
@@ -215,7 +218,6 @@ struct DefaultConvEpilogueWithReductionTensorOp<
   >::Epilogue;
 };
 
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 // Defaults for strided Dgrad
diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h
index c51d9d55..54706871 100644
--- a/include/cutlass/conv/kernel/default_conv2d_dgrad.h
+++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -77,7 +83,7 @@ template <
 //                               OpClassTensorOp convolutions 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided and
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided and
 // multistage pipeline.
 template <
   typename ElementA,
@@ -197,7 +203,7 @@ struct DefaultConv2dDgrad <
   >;
 };
 
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
 // and 2 stage pipeline.
 template <
   typename ElementA,
@@ -316,7 +322,7 @@ struct DefaultConv2dDgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity Strided
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity Strided
 // and multistage pipeline.
 template <
   typename ElementA,
@@ -436,7 +442,7 @@ struct DefaultConv2dDgrad <
   >;
 };
 
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity
 // 2 stage pipeline.
 template <
   typename ElementA,
@@ -555,7 +561,7 @@ struct DefaultConv2dDgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for optimized IteratorAlgorithm Dgrad Unity Strided
+/// Defines a kernel for Conv2dDgrad specialization for optimized IteratorAlgorithm Dgrad Unity Strided
 // and multistage pipeline.
 template <
   typename ElementA,
@@ -675,7 +681,7 @@ struct DefaultConv2dDgrad <
   >;
 };
 
-/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Strided and
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided and
 // multistage pipeline.
 template <
   typename ElementA,
@@ -795,7 +801,7 @@ struct DefaultConv2dDgrad <
   >;
 };
 
-/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Strided
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided
 // and 2 stage pipeline.
 template <
   typename ElementA,
@@ -912,7 +918,7 @@ struct DefaultConv2dDgrad <
   >;
 };
 
-/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Unity
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Unity
 // 2 stage pipeline
 template <
   typename ElementA,
@@ -1032,7 +1038,7 @@ struct DefaultConv2dDgrad <
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //                            OpClassSimt convolutions 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -1253,7 +1259,7 @@ struct DefaultConv2dDgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -1471,7 +1477,7 @@ struct DefaultConv2dDgrad <
 };
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
@@ -1693,7 +1699,7 @@ struct DefaultConv2dDgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h
index 2ce945d2..859f2396 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h
@@ -1,25 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
-namespace conv {
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -36,9 +41,14 @@ namespace conv {
 #include "cutlass/conv/kernel/default_conv2d.h"
 
 #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h"
-#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
 #include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h"
+
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h"
 #include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h"
+#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -77,7 +87,7 @@ template <
 //                         OpClassTensorOp convolutions 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
 /// pipeline.
 template <
   typename ElementA,
@@ -198,7 +208,490 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and two stage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Define the Mma
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  MathOperatorTag,
+  IteratorAlgorithm::kFewChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    arch::CacheOperation::Always,
+    IteratorB,
+    SmemIteratorB,
+    CacheOpB,
+    MmaPolicy,
+    Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
+/// pipeline.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape,
+  typename WarpShape,
+  typename InstructionShape,
+  typename EpilogueOutputOp,
+  typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA,
+  int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA,
+  LayoutA,
+  ElementB,
+  LayoutB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  arch::OpClassTensorOp,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  2,
+  MathOperatorTag,
+  IteratorAlgorithm::kFewChannels,
+  StrideSupport,
+  AlignmentA,
+  AlignmentB
+> {
+
+  // Define the core components from GEMM
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+      ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      2, MathOperatorTag>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the Mma
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA,
+    SmemIteratorA,
+    IteratorB,
+    SmemIteratorB,
+    ElementC,
+    LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Define the epilogue
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    EpilogueOutputOp,
+    EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Define the kernel
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma,
+    Epilogue,
+    ThreadblockSwizzle,
+    conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
 /// pipeline with interleaved layout.
 template <
   typename ElementA,
@@ -321,7 +814,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm
 /// and 2 stage pipeline.
 template <
   typename ElementA,
@@ -439,7 +932,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage 
 /// pipeline with interleaved layout.
 template <
   typename ElementA,
@@ -564,7 +1057,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and 
+/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
 /// multistage pipeline.
 template <
   typename ElementA,
@@ -688,7 +1181,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and 
+/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
 // multistage pipeline with interleaved layout.
 template <
   typename ElementA,
@@ -803,7 +1296,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm
 /// and 2 stage pipeline.
 template <
   typename ElementA,
@@ -923,7 +1416,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage 
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage 
 /// pipeline with interleaved layout.
 template <
   typename ElementA,
@@ -1039,7 +1532,7 @@ struct DefaultConv2dFprop <
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //                            OpClassSimt convolutions
 /////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -1150,7 +1643,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -1263,7 +1756,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
@@ -1375,7 +1868,7 @@ struct DefaultConv2dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
index 142b64c3..4fc2200a 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -77,7 +83,7 @@ template <
 //                         OpClassTensorOp convolutions 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage 
 /// pipeline.
 template <
   typename ElementA,
@@ -209,7 +215,7 @@ struct DefaultConv2dFpropFusion <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and 
+/// Defines a kernel for Conv2dFprop specialization for Optimzed IteratorAlgorithm and 
 /// multistage pipeline.
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
index 52c403a5..187c46f9 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
index b092b1a3..82bbd252 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h
index 677959a2..c5e5b3c5 100644
--- a/include/cutlass/conv/kernel/default_conv2d_wgrad.h
+++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -80,7 +86,7 @@ template <
 //                          OpClassTensorOp convolutions
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
@@ -197,7 +203,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm and two 
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and two 
 // pipeline.
 template <
   typename ElementA,
@@ -316,7 +322,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
@@ -433,7 +439,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and two 
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and two 
 // pipeline.
 template <
   typename ElementA,
@@ -553,7 +559,7 @@ struct DefaultConv2dWgrad <
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //                         OpClassSimt convolutions
 /////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -664,7 +670,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, 
 /// multi-stage pipeline, and FFMA-based mainloop for SM80
 
 template <
@@ -775,7 +781,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, 
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
@@ -886,7 +892,7 @@ struct DefaultConv2dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm, 
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, 
 /// 2 stage pipeline, and FFMA-based mainloop for SM50
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
index 5aed546d..e43e02cd 100644
--- a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
+++ b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -78,7 +84,7 @@ template <
 //                          OpClassTensorOp convolutions
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
@@ -195,7 +201,7 @@ struct DefaultConv2dWgradFusion <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h
index 929165cf..8f408ef0 100644
--- a/include/cutlass/conv/kernel/default_conv3d_dgrad.h
+++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -70,7 +76,7 @@ template <
   conv::StrideSupport StrideSupport = StrideSupport::kStrided
 > struct DefaultConv3dDgrad;
 
-/// Defines a kernel for Conv3dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided
+/// Defines a kernel for Conv3dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided
 // and multistage pipeline.
 template <
   typename ElementA,
@@ -179,7 +185,7 @@ struct DefaultConv3dDgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv3dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Strided
+/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided
 // and multistage pipeline.
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h
index ef63ffde..7b20ae34 100644
--- a/include/cutlass/conv/kernel/default_conv3d_fprop.h
+++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -179,7 +185,7 @@ struct DefaultConv3dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage
+/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage
 // pipeline.
 template <
   typename ElementA,
@@ -393,7 +399,7 @@ struct DefaultConv3dFprop <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv3dFprop specialzation for Optimized IteratorAlgorithm and multistage
+/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm and multistage
 // pipeline.
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h
index 0fd7f0f8..46dc3920 100644
--- a/include/cutlass/conv/kernel/default_conv3d_wgrad.h
+++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -71,7 +77,7 @@ template <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
@@ -176,7 +182,7 @@ struct DefaultConv3dWgrad <
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and two 
+/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and two 
 // pipeline.
 template <
   typename ElementA,
@@ -284,7 +290,7 @@ struct DefaultConv3dWgrad <
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and multistage 
+/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and multistage 
 // pipeline.
 template <
   typename ElementA,
@@ -389,7 +395,7 @@ struct DefaultConv3dWgrad <
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and two 
+/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and two 
 // pipeline.
 template <
   typename ElementA,
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h
index 28658541..4c5aa9db 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -231,7 +237,11 @@ struct ImplicitGemmConvolution {
       semaphore(semaphore),
       split_k_mode(args.split_k_mode)
     {
-      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+      gemm_k_iterations = implicit_gemm_k_iterations(
+        kConvolutionalOperator,
+        ThreadblockShape::kK,
+        args.problem_size,
+        kIteratorAlgorithm);
 
       ThreadblockSwizzle threadblock_swizzle;
 
@@ -424,4 +434,3 @@ struct ImplicitGemmConvolution {
 } // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
index 8d961048..97bad541 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
index d9529311..62fd9b77 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
index 1b5716e8..1dc7db02 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -489,4 +495,3 @@ struct ImplicitGemmConvolutionWithFusedEpilogue {
 } // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
index 59b5c8bd..0af34bab 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
index ba7b839b..4014173b 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
index 8434b398..80448f36 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
index 99e72adb..4b1e906a 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
index 9b2296b7..add089af 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
new file mode 100644
index 00000000..d95f3758
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropActivationTileAccessIteratorFewChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kPositionsPerTile = Shape::kColumn;
+
+  static int const kAccessesPerVector = kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static int const kStrideH = 0;
+  static int const kStrideW = 0;
+  static int const kDilationH = 0;
+  static int const kDilationW = 0;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rsc_index_;
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rsc_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rsc_index_ = (threadblock_offset.column() + thread_coord.contiguous());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      if (kUseFastDivmodPrologue) {
+        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
+        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
+      }
+      else {
+        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+        offset_p_[s] = residual / problem_size_.Q;
+        offset_q_[s] = residual % problem_size_.Q;
+      }
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
+
+    int r = 0;
+    int s = 0;
+    int c = 0;
+
+    if (kUseFastDivmodMainloop) {
+      int rs_index = params_.divmod_C.divmod(c, rsc_index);
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      c = (rsc_index % problem_size_.C);
+
+      int rs_index = (rsc_index / problem_size_.C);
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int stride_h = kStrideH;
+    if (!kStrideH) {
+      stride_h = problem_size_.stride_h;
+    }
+
+    int stride_w = kStrideW;
+    if (!kStrideW) {
+      stride_w = problem_size_.stride_w;
+    }
+
+    int dilation_h = kDilationH;
+    if (!kDilationH) {
+      dilation_h = problem_size_.dilation_h;
+    }
+
+    int dilation_w = kDilationW;
+    if (!kDilationW) {
+      dilation_w = problem_size_.dilation_w;
+    }
+
+    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
+    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    bool in_bounds =
+      coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+
+    return in_bounds;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w +
+      coord.c();
+
+    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationH && problem_size.dilation_h != kDilationH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationW && problem_size.dilation_w != kDilationW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideH && problem_size.stride_h != kStrideH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideW && problem_size.stride_w != kStrideW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
new file mode 100644
index 00000000..a4bc830a
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
@@ -0,0 +1,353 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropActivationTileAccessIteratorFixedChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kFilterPositionsPerTile = Shape::kColumn / AccessType::kElements;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static int const kStrideH = 0;
+  static int const kStrideW = 0;
+  static int const kDilationH = 0;
+  static int const kDilationW = 0;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rs_index_;
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFixedChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rs_index_(0) {
+
+    //
+    // This requires problem_size.C == AccessType::kElements
+    //
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rs_index_ = (threadblock_offset.column() + thread_coord.contiguous()) / AccessType::kElements;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      if (kUseFastDivmodPrologue) {
+        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
+        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
+      }
+      else {
+        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+        offset_p_[s] = residual / problem_size_.Q;
+        offset_q_[s] = residual % problem_size_.Q;
+      }
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int rs_index = rs_index_ + iteration_vector_;
+
+    int r = 0;
+    int s = 0;
+
+    if (kUseFastDivmodMainloop) {
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    int stride_h = kStrideH;
+    if (!kStrideH) {
+      stride_h = problem_size_.stride_h;
+    }
+
+    int stride_w = kStrideW;
+    if (!kStrideW) {
+      stride_w = problem_size_.stride_w;
+    }
+
+    int dilation_h = kDilationH;
+    if (!kDilationH) {
+      dilation_h = problem_size_.dilation_h;
+    }
+
+    int dilation_w = kDilationW;
+    if (!kDilationW) {
+      dilation_w = problem_size_.dilation_w;
+    }
+
+    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
+    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
+
+    return TensorCoord(n, h, w, 0);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w + coord.c();
+
+    AccessType const *ptr = reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFixedChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C != AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationH && problem_size.dilation_h != kDilationH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationW && problem_size.dilation_w != kDilationW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideH && problem_size.stride_h != kStrideH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideW && problem_size.stride_w != kStrideW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<32>>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorNCxHWx<64>>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
index 4ca15ae9..147d4f1a 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
index 25b254f3..08d3176d 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
new file mode 100644
index 00000000..7fb30ad9
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
@@ -0,0 +1,289 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropFilterTileAccessIteratorFewChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kPositionsPerTile = Shape::kRow;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rsc_index_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFewChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rsc_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rsc_index_ = (threadblock_offset.row() + thread_coord.contiguous());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
+
+    int c = 0;
+    int s = 0;
+    int r = 0;
+
+    if (kUseFastDivmodMainloop) {
+      int rs_index = params_.divmod_C.divmod(c, rsc_index);
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      c = (rsc_index % problem_size_.C);
+      int rs_index = (rsc_index / problem_size_.C);
+
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, r, s, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    bool in_bounds =
+      coord.n() < problem_size_.K &&
+      coord.h() >= 0 &&
+      coord.h() < problem_size_.R &&
+      coord.c() < problem_size_.C;
+
+    return in_bounds;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w +
+      coord.c();
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFewChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
+      if (problem_size.K % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
+      if (problem_size.K % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
new file mode 100644
index 00000000..7b101235
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
@@ -0,0 +1,275 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorCxRSKx<Interleave> layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray<Element_, ThreadMap_::kElementsPerAccess>
+>
+class Conv2dFpropFilterTileAccessIteratorFixedChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kFilterPositionsPerTile = Shape::kRow / AccessType::kElements;
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams<Layout>;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  int rs_index_;
+
+  int offset_k_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFixedChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast<char const *>(ptr)),
+    rs_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rs_index_ = (threadblock_offset.row() + thread_coord.contiguous()) / AccessType::kElements;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+    }
+
+    set_iteration_index(0);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * 8 / sizeof_bits<Element>::value;
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the filter tensor W that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int rs_index = rs_index_ + iteration_vector_;
+
+    int r = 0;
+    int s = 0;
+
+    if (kUseFastDivmodMainloop) {
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    int k = offset_k_[iteration_strided_];
+
+    return TensorCoord(k, r, s, 0);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor W
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.K && coord.h() >= 0 && coord.h() < problem_size_.R;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w + coord.c();
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropFilterTileAccessIteratorFixedChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C != AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<32>>::value) {
+      if (problem_size.K % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same<Layout, layout::TensorCxRSKx<64>>::value) {
+      if (problem_size.K % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
index 1ff04531..2f6a1243 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h
index 511504de..1ba9532c 100644
--- a/include/cutlass/conv/threadblock/conv2d_params.h
+++ b/include/cutlass/conv/threadblock/conv2d_params.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -77,6 +83,48 @@ struct Conv2dAnalyticParams {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Params structure used for all Conv2d analytic tile iterators
+template< typename Layout_ = layout::TensorNHWC >
+struct Conv2dFewChannelsParams {
+
+  using Layout = Layout_;
+
+
+  int32_t stride_w;
+  int32_t stride_h;
+  int32_t stride_n;
+
+  FastDivmod divmod_P;
+  FastDivmod divmod_Q;
+  FastDivmod divmod_S;
+  FastDivmod divmod_C;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFewChannelsParams() { }
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFewChannelsParams(
+    Conv2dProblemSize const &problem_size,  // unused; placeholder to match other Params interfaces.
+    Layout const &layout
+  ):
+    stride_w(int32_t(layout.stride()[0])),
+    stride_h(int32_t(layout.stride()[1])),
+    stride_n(int32_t(layout.stride()[2])),
+    divmod_P(problem_size.P),
+    divmod_Q(problem_size.Q),
+    divmod_S(problem_size.S),
+    divmod_C(problem_size.C)
+  {
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Parameters structure used for Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams
 struct Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams {
   
@@ -773,6 +821,8 @@ struct Conv2dWgradActivationIteratorOptimizedParams {
   FastDivmod pq_divmod;
   FastDivmod q_divmod;
   FastDivmod c_divmod;
+  FastDivmod s_divmod;
+  int small_channel_conv_s_offset;
 
   //
   // Methods
@@ -789,8 +839,9 @@ struct Conv2dWgradActivationIteratorOptimizedParams {
     sc_divmod(problem_size.S * problem_size.C),
     pq_divmod(problem_size.P * problem_size.Q),
     q_divmod(problem_size.Q),
-    c_divmod(problem_size.C) {
-
+    c_divmod(problem_size.C),
+    s_divmod(problem_size.S * problem_size.dilation_w),
+    small_channel_conv_s_offset((problem_size.S - 1) * problem_size.dilation_w - problem_size.pad_w) {
   }
 
   CUTLASS_HOST_DEVICE
diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
index 72c52c2e..66dd75d2 100644
--- a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
+++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
index 70d84187..ec68cc89 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -210,7 +216,7 @@ class Conv2dWgradActivationTileAccessIteratorAnalytic {
    
     int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
     int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
-  
+ 
     return TensorCoord(n, h, w, c);
   }
 
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
index c4e886a3..fec9dcda 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -213,17 +219,19 @@ class Conv2dWgradActivationTileAccessIteratorOptimized {
       if (problem_size_.mode == Mode::kConvolution) {
         s -= (problem_size_.dilation_w * wrap_c);
         
-        int wrap_s = (s == -problem_size_.pad_w - problem_size_.dilation_w); 
-        s = wrap_s ? (-problem_size_.pad_w + (problem_size_.S - 1) * problem_size_.dilation_w): s;
-        
+        int wrap_s;
+        params_.s_divmod(wrap_s, s, params_.small_channel_conv_s_offset - s);
+        s = params_.small_channel_conv_s_offset - s;
+
         r -= (problem_size_.dilation_h * wrap_s);
 
       } else {
         s += (problem_size_.dilation_w * wrap_c);
-        
-        int wrap_s = (s == (-problem_size_.pad_w + problem_size_.S * problem_size_.dilation_w)); 
-        s = wrap_s ? -problem_size_.pad_w : s;
-        
+
+        int wrap_s;
+        params_.s_divmod(wrap_s, s, s + problem_size_.pad_w);
+        s -= problem_size_.pad_w;
+
         r += (problem_size_.dilation_h * wrap_s);
       }
     }
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
index c9e12973..0a26d646 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
index deacd8d7..aac0e3c3 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
index f21cec8c..331132c7 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
index e65feb4c..a5a760db 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
index 2eff5751..0fc623e2 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
index 86854354..3f6b36e9 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
index 009d0b1a..a192bfd4 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
index 25b5d3cf..d0e89bd6 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
index b56cee75..8b388b0e 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
index c19de1af..9b69dbcb 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_params.h b/include/cutlass/conv/threadblock/conv3d_params.h
index 4e30751b..5ad1e4fa 100644
--- a/include/cutlass/conv/threadblock/conv3d_params.h
+++ b/include/cutlass/conv/threadblock/conv3d_params.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
index 039272df..ebf7e84e 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
index c9a0df93..0413c7dc 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
index 2779b1c9..4d57f0ba 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
index 9ba29b06..2bc5971b 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
index 5a32f7d9..4f16b42d 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
index 6033772b..36b41aac 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -321,7 +327,7 @@ class ImplicitGemmMultistage :
               sizeof_bits<typename IteratorB::Element>::value *
               IteratorB::ThreadMap::kElementsPerAccess /
               IteratorB::kAccessesPerVector / 8;
-  
+
           cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
               dst_ptr + v, iterator_B.get(), iterator_B.valid());
   
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
index 28374b87..f77e2e33 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
index 5cd87b1b..7066997a 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
index db2c2cd6..7d60e4b0 100644
--- a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
+++ b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
index 1719afee..7b63c485 100644
--- a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
+++ b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/regular_scale_bias_vector_access_iterator.h b/include/cutlass/conv/threadblock/regular_scale_bias_vector_access_iterator.h
index 5f3116a9..ba8a4bb5 100644
--- a/include/cutlass/conv/threadblock/regular_scale_bias_vector_access_iterator.h
+++ b/include/cutlass/conv/threadblock/regular_scale_bias_vector_access_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/threadblock/threadblock_swizzle.h b/include/cutlass/conv/threadblock/threadblock_swizzle.h
index cc258e71..06d1bdfa 100644
--- a/include/cutlass/conv/threadblock/threadblock_swizzle.h
+++ b/include/cutlass/conv/threadblock/threadblock_swizzle.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h b/include/cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h
index 5fa48afd..85b8dde2 100644
--- a/include/cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h
+++ b/include/cutlass/conv/warp/conv2d_fprop_scale_bias_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/conv/warp/scale_bias_relu_transform.h b/include/cutlass/conv/warp/scale_bias_relu_transform.h
index 0f7e04ce..5bcbfcd0 100644
--- a/include/cutlass/conv/warp/scale_bias_relu_transform.h
+++ b/include/cutlass/conv/warp/scale_bias_relu_transform.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h
index 02343c84..1fe8ec0f 100644
--- a/include/cutlass/coord.h
+++ b/include/cutlass/coord.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h
index 4ad4f3f4..eef43602 100644
--- a/include/cutlass/core_io.h
+++ b/include/cutlass/core_io.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -222,21 +228,19 @@ std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) {
 
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-//                       stream operators for cutlass::layout namespace                          //
+//                       stream operators for cutlass namespace                          //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-namespace layout {
 
 /// Default printing to ostream for PitchLinearShape
 template < int Contiguous, int Strided>
 inline
 std::ostream & operator<<(std::ostream &out, PitchLinearShape<Contiguous, Strided> const &pitch_linear_shape) {
-  out << "cutlass::layout::PitchLinearShape:(kContiguous, kStrided) {"
+  out << "cutlass::PitchLinearShape:(kContiguous, kStrided) {"
     << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kContiguous <<","
     << cutlass::layout::PitchLinearShape<Contiguous,Strided>::kStrided <<"}";
   return out;
 }
 
-} //namespace layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 
diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h
index 25406616..ebc4c1e9 100644
--- a/include/cutlass/cutlass.h
+++ b/include/cutlass/cutlass.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -39,12 +45,26 @@
 
 #define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0)
 
+#if !defined(__CUDACC_RTC__)
+
+#include <assert.h>
+
 #if defined(_MSC_VER)
   #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
 #else
   #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
 #endif
 
+#else
+
+#if defined(_MSC_VER)
+  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
+#else
+  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
+#endif
+
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -74,6 +94,7 @@ enum class Status {
   kErrorInternal,              ///< An error within CUTLASS occurred.
   kErrorArchMismatch,          ///< CUTLASS runs on a device that it was not compiled for.
   kErrorInsufficientDriver,    ///< CUTLASS runs with a driver that is too old.
+  kErrorMemoryAllocation,      ///< Kernel launch failed due to insufficient device memory.
   kInvalid                     ///< Status is unspecified.
 };
 
@@ -100,7 +121,9 @@ static char const* cutlassGetStatusString(cutlass::Status status) {
     case cutlass::Status::kErrorInsufficientDriver:
       return "Error Insufficient Driver";
     case cutlass::Status::kErrorArchMismatch:
-      return "Erroor Architecture Mismatch";
+      return "Error Architecture Mismatch";
+    case cutlass::Status::kErrorMemoryAllocation:
+      return "Error Memory Allocation failed";
     case cutlass::Status::kInvalid: break;
   }
 
@@ -155,6 +178,17 @@ static const int NUM_THREADS_PER_QUAD_PAIR = NUM_THREADS_PER_QUAD * 2;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Helper function to return true when called by thread 0 of threadblock 0.
+CUTLASS_HOST_DEVICE bool thread0() {
+  #if defined(__CUDA_ARCH__)
+    return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z);
+  #else
+    return false;
+  #endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace cutlass
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h
index d73318fa..1de33024 100644
--- a/include/cutlass/device_kernel.h
+++ b/include/cutlass/device_kernel.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h
index 26071666..c30a8209 100644
--- a/include/cutlass/epilogue/thread/activation.h
+++ b/include/cutlass/epilogue/thread/activation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -56,6 +62,7 @@ struct Identity {
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// ReLu operator - propagates NaNs
+/// Always put threshold in the right hand side of max to propagate NaN.
 template <typename T>
 struct ReLu {
   static const bool kIsHeavy=false;
@@ -81,7 +88,7 @@ struct ReLu<Array<T, N>> {
   Array<T, N> operator()(T const & threshold, Array<T, N> const &frag) const {
     maximum<Array<T, N> > mx;
 
-    return mx(threshold, frag);
+    return mx(frag, threshold);
   }
 
   CUTLASS_HOST_DEVICE
diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h
index e21a96d8..baf9eb7d 100644
--- a/include/cutlass/epilogue/thread/conversion_op.h
+++ b/include/cutlass/epilogue/thread/conversion_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h
index 703de589..004b3a7c 100644
--- a/include/cutlass/epilogue/thread/linear_combination.h
+++ b/include/cutlass/epilogue/thread/linear_combination.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -180,7 +186,6 @@ class LinearCombination {
     if (Scale == ScaleType::Nothing)
       return destination_converter(converted_accumulator);
 
-
     // Perform binary operations
     ComputeFragment intermediate;
 
diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h b/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
index 389eb26e..d85384f5 100644
--- a/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
+++ b/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
index cb3a4faa..63168330 100644
--- a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h
index bb98b0ec..185a3d1b 100644
--- a/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_dgelu.h b/include/cutlass/epilogue/thread/linear_combination_dgelu.h
index 2429fdaa..d80682f4 100644
--- a/include/cutlass/epilogue/thread/linear_combination_dgelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_dgelu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_drelu.h b/include/cutlass/epilogue/thread/linear_combination_drelu.h
index 2b853487..74b090d3 100644
--- a/include/cutlass/epilogue/thread/linear_combination_drelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_drelu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h
index 5ad986a0..92712433 100644
--- a/include/cutlass/epilogue/thread/linear_combination_gelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h
index 1712a283..e7f55ffe 100644
--- a/include/cutlass/epilogue/thread/linear_combination_generic.h
+++ b/include/cutlass/epilogue/thread/linear_combination_generic.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h
index d6f57270..851712bf 100644
--- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h
+++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h
@@ -1,24 +1,30 @@
 /*************************************************************************************************** 
-* Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
index 34f938b9..7773b055 100644
--- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
+++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h
index 7bd79a3b..5743adc7 100644
--- a/include/cutlass/epilogue/thread/linear_combination_relu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_relu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_relu0.h b/include/cutlass/epilogue/thread/linear_combination_relu0.h
index 0cb4f956..4fe1d3fe 100644
--- a/include/cutlass/epilogue/thread/linear_combination_relu0.h
+++ b/include/cutlass/epilogue/thread/linear_combination_relu0.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_residual_block.h b/include/cutlass/epilogue/thread/linear_combination_residual_block.h
index 263bdcc2..c32ab7c4 100644
--- a/include/cutlass/epilogue/thread/linear_combination_residual_block.h
+++ b/include/cutlass/epilogue/thread/linear_combination_residual_block.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -94,13 +100,13 @@ class LinearCombinationResidualBlock {
   };
 
 private:
-  
+
   ElementCompute alpha_;
   ElementCompute beta_;
   bool skip_elementwise_;
 
 public:
-  
+
   /// Constructor from Params
   CUTLASS_HOST_DEVICE
   LinearCombinationResidualBlock(Params const &params) {
diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
index eb5516ed..752897f0 100644
--- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
+++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h
index 79ef792e..641dfb3e 100644
--- a/include/cutlass/epilogue/thread/linear_combination_silu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_silu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h b/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
index 304a472b..4c34256e 100644
--- a/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
+++ b/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h
index fb94dd17..38fe9096 100644
--- a/include/cutlass/epilogue/thread/reduction_op.h
+++ b/include/cutlass/epilogue/thread/reduction_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/thread/scale_type.h b/include/cutlass/epilogue/thread/scale_type.h
index 576b4194..043797c6 100644
--- a/include/cutlass/epilogue/thread/scale_type.h
+++ b/include/cutlass/epilogue/thread/scale_type.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
index 10f39eec..cce51ad6 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
new file mode 100644
index 00000000..77dceb4a
--- /dev/null
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
@@ -0,0 +1,264 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped complex GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Specialization and defines sensible defaults for epilogues for complex*complex case
+//  4 real-valued mma operations (Complex)
+//  A = (ar + j ai), B (br +j bi), D = AB
+//  D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  /// Epilouge Shape
+  typename Shape_,
+  /// Warp-level mma operator
+  typename WarpMmaTensorOp_,
+  /// Number of k partitions
+  int PartitionsK,
+  /// Epilogue output operator
+  typename OutputOp_,
+  /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load()
+  int ElementsPerAccess,
+  /// Multiply-add operator 
+  /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) 
+  typename Operator_ = arch::OpMultiplyAddComplex,
+  /// Is for a symmetric kernel
+  BlasMode BlasMode_ = BlasMode::kGemm
+> 
+struct DefaultEpilogueComplexTensorOpBlas3 {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = Operator_;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput
+    , kBlasMode
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Hard-coded padding elements added 
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Partial specialization and defines sensible defaults for epilogues for complex*complex case
+//  3 real-valued mma operations (Gaussian Complex)
+//  A  = (ar + j ai), B = (br +j bi), D = AB
+//  P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) 
+//  D  = dr + j di = (P1 - P3) + j (P1 + P2)
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess, 
+  BlasMode BlasMode_
+>
+struct DefaultEpilogueComplexTensorOpBlas3 <Shape_, WarpMmaTensorOp_, PartitionsK, 
+                                      OutputOp_, ElementsPerAccess, 
+                                      arch::OpMultiplyAddGaussianComplex
+                                      , BlasMode_
+> {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  using Operator = arch::OpMultiplyAddGaussianComplex;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput,
+    kBlasMode
+  >;
+
+  using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::ElementC,
+    typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+    LayoutC
+  >;
+
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    ElementAccumulator,
+    LayoutC
+  >;
+
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
+    typename OutputTileThreadMap::CompactedThreadMap,
+    ElementAccumulator
+  >;
+
+  /// Hard-coded padding elements added 
+  using Padding = cutlass::MatrixShape<0, 0>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h b/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
index 62744d51..3ee95a62 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
index c56e588f..5ddb99d1 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
index 8bceb538..ce665251 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -72,7 +78,8 @@ template <
   typename Shape_,
   typename WarpMmaSimt_,
   typename OutputOp_,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueSimt {
 
@@ -101,7 +108,8 @@ struct DefaultEpilogueSimt {
 
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
index 177c3367..a7742c08 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -128,155 +134,139 @@ struct DefaultIteratorsTensorOp<float, float, 4, ThreadblockShape, WarpShape, In
   static int const kFragmentsPerIteration = 2;
 };
 
-/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
+/// Partial specialization for int32_t <= int32_t x 4
 template <
   typename ThreadblockShape,
   typename WarpShape,
   typename InstructionShape,
   typename ThreadMap
 >
-struct DefaultIteratorsTensorOp<
-  half_t, 
-  float, 
-  8, 
-  ThreadblockShape, 
-  WarpShape, 
-  InstructionShape, 
-  ThreadMap> {
+struct DefaultIteratorsTensorOp<int32_t, int32_t, 4, ThreadblockShape, WarpShape, InstructionShape, ThreadMap> {
   
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp<
     WarpShape,
     InstructionShape,
-    float,
-    32,
-    16,
-    8,
-    8
+    int32_t,
+    layout::RowMajor
   >;
 
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator<
     ThreadMap,
-    float,
-    32,
-    16,
-    8,
-    8
+    int32_t
   >;
 
-  static int const kFragmentsPerIteration = 2;
+  static int const kFragmentsPerIteration = 1;
 };
 
-/// Partial specialization for int8_t x 16 <= int32_t x 16 epilogues avoids shared memory bank conflicts.
+/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts.
 template <
-  int K,
+  typename ThreadblockShape,
+  typename WarpShape,
   typename InstructionShape,
   typename ThreadMap
 >
 struct DefaultIteratorsTensorOp<
-  int8_t, 
-  int32_t, 
-  16, 
-  gemm::GemmShape<128, 128, K>, 
-  gemm::GemmShape<64, 64, K>, 
+  half_t, 
+  float, 
+  8, 
+  ThreadblockShape, 
+  WarpShape, 
   InstructionShape, 
   ThreadMap> {
   
   using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    gemm::GemmShape<64, 64, K>,
+    WarpShape,
     InstructionShape,
-    int32_t,
+    float,
     32,
-    8,
     16,
+    8,
     8
   >;
 
   using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
     ThreadMap,
-    int32_t,
+    float,
     32,
-    8,
     16,
+    8,
     8
   >;
 
-  static int const kFragmentsPerIteration = 1;
+  static int const kFragmentsPerIteration = 2;
 };
 
-/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts.
+/// Partial specialization for int8/int4b_t <= int32 x 16/8 epilogues avoids shared memory bank conflicts.
+/// Threadblock::kN = 256 still has bank conflicts.
 template <
-  int K,
+  typename ElementOutput,
+  int ElementsPerAccess,
+  typename ThreadblockShape,
+  typename WarpShape,
   typename InstructionShape,
   typename ThreadMap
 >
 struct DefaultIteratorsTensorOp<
-  int8_t, 
+  ElementOutput, 
   int32_t, 
-  8, 
-  gemm::GemmShape<128, 64, K>, 
-  gemm::GemmShape<64, 32, K>, 
+  ElementsPerAccess,
+  ThreadblockShape, 
+  WarpShape, 
   InstructionShape, 
   ThreadMap> {
+
+  static_assert(cutlass::platform::is_same<ElementOutput, cutlass::int4b_t>::value ||
+                cutlass::platform::is_same<ElementOutput, cutlass::uint4b_t>::value ||
+                cutlass::platform::is_same<ElementOutput, int8_t>::value ||
+                cutlass::platform::is_same<ElementOutput, uint8_t>::value,
+                "ElementOutput needs to be 4 or 8 bit (unsigned) int.");
+
+   static_assert((ElementsPerAccess == 16 || ElementsPerAccess == 8),
+                "ElementsPerAccess needs to be 16 or 8.");
   
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    gemm::GemmShape<64, 32, K>,
+  using WarpTileIteratorMixed = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
+    WarpShape,
     InstructionShape,
     int32_t,
     32,
-    8,
-    8,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
     8
   >;
 
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
-    ThreadMap,
+  using WarpTileIteratorNotMixed =  cutlass::epilogue::warp::TileIteratorTensorOp<
+    WarpShape,
+    InstructionShape,
     int32_t,
-    32,
-    8,
-    8,
-    8
+    layout::RowMajor
   >;
 
-  static int const kFragmentsPerIteration = 1;
-};
+  using WarpTileIterator = typename cutlass::platform::conditional<
+                             (ThreadblockShape::kN == 256),
+                             WarpTileIteratorNotMixed,
+                             WarpTileIteratorMixed>::type;
 
-/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts.
-template <
-  int K,
-  typename InstructionShape,
-  typename ThreadMap
->
-struct DefaultIteratorsTensorOp<
-  int8_t, 
-  int32_t, 
-  8, 
-  gemm::GemmShape<64, 64, K>, 
-  gemm::GemmShape<32, 32, K>, 
-  InstructionShape, 
-  ThreadMap> {
-  
-  using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed<
-    gemm::GemmShape<32, 32, K>,
-    InstructionShape,
+  using SharedLoadIteratorMixed = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+    ThreadMap,
     int32_t,
     32,
-    8,
-    8,
+    cutlass::sizeof_bits<ElementOutput>::value,
+    ElementsPerAccess,
     8
   >;
 
-  using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed<
+  using SharedLoadIteratorNotMixed = cutlass::epilogue::threadblock::SharedLoadIterator<
     ThreadMap,
-    int32_t,
-    32,
-    8,
-    8,
-    8
+    int32_t
   >;
 
+  using SharedLoadIterator = typename cutlass::platform::conditional<
+                             (ThreadblockShape::kN == 256),
+                             SharedLoadIteratorNotMixed,
+                             SharedLoadIteratorMixed>::type;
+
   static int const kFragmentsPerIteration = 1;
 };
-
 } // namespace detail
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -287,7 +277,8 @@ template <
   typename WarpMmaTensorOp_,
   int PartitionsK,
   typename OutputOp_,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueTensorOp {
 
@@ -318,10 +309,11 @@ struct DefaultEpilogueTensorOp {
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     OutputTileThreadMap,
     ElementOutput,
+    ScatterD,
     UseCUDAStore
   >;
 
-  using AccumulatorFragmentIterator = typename std::conditional<is_complex<ElementOutput>::value,
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                     cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                         typename WarpMmaTensorOp::Shape,
                                         typename WarpMmaTensorOp::Policy::Operator::Shape,
@@ -410,7 +402,7 @@ struct DefaultEpilogueTensorOpStridedDgrad {
     ElementOutput
   >;
 
-  using AccumulatorFragmentIterator = typename std::conditional<is_complex<ElementOutput>::value,
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                     cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                         typename WarpMmaTensorOp::Shape,
                                         typename WarpMmaTensorOp::Policy::Operator::Shape,
@@ -503,7 +495,7 @@ struct DefaultEpilogueTensorOpAffineRankN {
   >;
 
   // Map to the row major iterator since the iterator selection for affineN is the same.
-  using AccumulatorFragmentIterator = typename std::conditional<is_complex<ElementOutput>::value,
+  using AccumulatorFragmentIterator = typename platform::conditional<is_complex<ElementOutput>::value,
                                     cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
                                         typename WarpMmaTensorOp::Shape,
                                         typename WarpMmaTensorOp::Policy::Operator::Shape,
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
new file mode 100644
index 00000000..6b796b35
--- /dev/null
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/epilogue/thread/linear_combination_clamp.h"
+#include "cutlass/epilogue/thread/linear_combination_relu.h"
+#include "cutlass/epilogue/thread/linear_combination_gelu.h"
+#include "cutlass/epilogue/thread/linear_combination_sigmoid.h"
+#include "cutlass/epilogue/thread/linear_combination_planar_complex.h"
+
+#include "cutlass/epilogue/thread/conversion_op.h"
+#include "cutlass/epilogue/thread/reduction_op.h"
+
+#include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h"
+
+#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
+#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h"
+#include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator.h"
+#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/threadblock/interleaved_epilogue.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Defines sensible defaults for epilogues for TensorOps.
+template <
+  typename Shape_,
+  typename WarpMmaTensorOp_,
+  int PartitionsK,
+  typename OutputOp_,
+  int ElementsPerAccess,
+  /// Is for a symmetric kernel
+  BlasMode BlasMode_ = BlasMode::kGemm
+>
+struct DefaultEpilogueTensorOpBlas3 {
+
+  using Shape = Shape_;
+  using WarpMmaTensorOp = WarpMmaTensorOp_;
+  static int const kPartitionsK = PartitionsK;
+  using OutputOp = OutputOp_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using ElementOutput = typename OutputOp::ElementOutput;
+  using LayoutC = typename WarpMmaTensorOp::LayoutC;
+  using ElementAccumulator = typename WarpMmaTensorOp::ElementC;
+
+  //
+  // Thread map
+  //
+
+  using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp<
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    kPartitionsK,
+    ElementOutput,
+    kElementsPerAccess
+  >::Type;
+
+  using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorBlas3<
+    OutputTileThreadMap,
+    ElementOutput,
+    kBlasMode
+  >;
+
+  using AccumulatorFragmentIterator = typename std::conditional<is_complex<ElementOutput>::value,
+                                    cutlass::epilogue::warp::FragmentIteratorComplexTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC>,
+                                    cutlass::epilogue::warp::FragmentIteratorTensorOp<
+                                        typename WarpMmaTensorOp::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::Shape,
+                                        typename WarpMmaTensorOp::Policy::Operator::ElementC,
+                                        typename WarpMmaTensorOp::Policy::Operator::FragmentC,
+                                        LayoutC> >::type;
+
+  /// Support several implementations depending on structure of epilogue
+  using DefaultIterators = detail::DefaultIteratorsTensorOp<
+    ElementOutput,
+    ElementAccumulator,
+    kElementsPerAccess,
+    Shape,
+    typename WarpMmaTensorOp::Shape,
+    typename WarpMmaTensorOp::Policy::Operator::Shape,
+    typename OutputTileThreadMap::CompactedThreadMap
+  >;
+
+  using WarpTileIterator = typename DefaultIterators::WarpTileIterator;
+  using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator;
+
+  /// Hard-coded padding elements added 
+  using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits<ElementAccumulator>::value * 4>;
+
+  //
+  // Define the epilogue
+  //
+  using Epilogue = cutlass::epilogue::threadblock::Epilogue<
+    Shape,
+    WarpMmaTensorOp,
+    kPartitionsK,
+    OutputTileIterator,
+    AccumulatorFragmentIterator,
+    WarpTileIterator,
+    SharedLoadIterator,
+    OutputOp,
+    Padding
+  >;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
index f60b9403..3b42c889 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -74,7 +80,8 @@ template <
   typename WarpMmaTensorOp_,
   int PartitionsK,
   typename OutputOp_,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueVoltaTensorOp {
 
@@ -103,7 +110,8 @@ struct DefaultEpilogueVoltaTensorOp {
 
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorVoltaTensorOp<
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h b/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
index 5c898bc6..ebac2c46 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -60,7 +66,8 @@ template <
   typename ElementTensor,
   typename ElementVector,
   typename OutputOp,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueWithBroadcastTensorOp {
 
@@ -78,7 +85,8 @@ struct DefaultEpilogueWithBroadcastTensorOp {
   //
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     typename Base::OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   //
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h b/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
index b9882cdb..223d7a7c 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -60,7 +66,8 @@ template <
   typename ElementOutput,
   typename OutputOp,
   typename ReductionOp,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueWithReductionTensorOp {
 
@@ -81,7 +88,8 @@ struct DefaultEpilogueWithReductionTensorOp {
 
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     typename Base::OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   /// Define the epilogue
@@ -111,7 +119,8 @@ template <
   typename ElementOutput,
   typename OutputOp,
   typename ReductionOp,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueWithReductionVoltaTensorOp {
 
@@ -132,7 +141,8 @@ struct DefaultEpilogueWithReductionVoltaTensorOp {
 
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     typename Base::OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   /// Define the epilogue
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
index 7e572d65..e35065e3 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -72,7 +78,8 @@ template <
   typename WarpMmaTensorOp_,
   int PartitionsK,
   typename OutputOp_,
-  int ElementsPerAccess
+  int ElementsPerAccess,
+  bool ScatterD = false
 >
 struct DefaultEpilogueWmmaTensorOp {
 
@@ -101,7 +108,8 @@ struct DefaultEpilogueWmmaTensorOp {
 
   using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator<
     OutputTileThreadMap,
-    ElementOutput
+    ElementOutput,
+    ScatterD
   >;
 
   using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorWmmaTensorOp<
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
index 0a876652..984108f8 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
index 24ccae80..b58eef26 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
index 6d88a1b3..2c858401 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
index 94e6bf0e..e58ba9a7 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h b/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
index 6e3a418b..c6a462b4 100644
--- a/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
+++ b/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -114,7 +120,8 @@ class DirectStoreEpilogueIterator {
     Element *pointer_,
     TensorCoord extent_,
     int thread_idx_,
-    TensorCoord threadblock_offset_ = TensorCoord()
+    TensorCoord threadblock_offset_ = TensorCoord(),
+    int const * indices = nullptr
   ): 
     pointer(pointer_),
     stride(params.stride / sizeof(Element)),
diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h
index dd0f7539..9ee85637 100644
--- a/include/cutlass/epilogue/threadblock/epilogue.h
+++ b/include/cutlass/epilogue/threadblock/epilogue.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -54,7 +60,7 @@
 
 #include "cutlass/epilogue/threadblock/epilogue_base.h"
 #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/util/index_sequence.h"
+#include "cutlass/numeric_types.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -149,7 +155,6 @@ class Epilogue :
 
 public:
 
-
   static_assert(SharedLoadIterator::Fragment::kElements == OutputTileIterator::Fragment::kElements,
     "Mismatch between shared load iterator and output tile iterator.");
 
diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h
index a4483367..8f52ebdd 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_base.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/epilogue_direct_store.h b/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
index 028aee42..4bbfafaf 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h b/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
index ae79c96f..63cd4c6e 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -54,7 +60,7 @@
 
 #include "cutlass/epilogue/threadblock/epilogue_base.h"
 #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
-#include "cutlass/util/index_sequence.h"
+#include "cutlass/numeric_types.h"
 
 namespace cutlass {
 namespace epilogue {
diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
index 9366406e..89f1fa56 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -32,8 +38,6 @@
 
 #pragma once
 
-#include <assert.h>
-
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
diff --git a/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h b/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
index 77d21f05..d87f645d 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -140,6 +146,7 @@ class EpilogueSmemAccumulator {
         FragmentSmemAccessType * smem_frag_ptr =  
           reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
   
+        CUTLASS_PRAGMA_UNROLL
         for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
           frag_iterator_accum.load(tb_frag_accum);
           ++frag_iterator_accum;
@@ -161,6 +168,56 @@ class EpilogueSmemAccumulator {
       }
     }
   }
+
+  /// Streams the result to shared memory
+  CUTLASS_DEVICE
+  void operator()(
+    OutputOp const &output_op,                    ///< Output operator
+    SmemTileIterator smem_iterator,               ///< Tile iterator for destination in shared memory
+    AccumulatorTile const &accumulator) {          ///< Complete warp-level accumulator tile
+ 
+    /// Fragment Iterator to load slice of accumulator tile
+    AccumulatorFragmentIterator frag_iterator_accum(accumulator);
+    FragmentAccumulator tb_frag_accum;
+  
+    /// Epilogue output fragment
+    typename SmemTileIterator::Fragment tb_frag_smem;
+  
+    /// Iterate over the accumulator tile and store to shared memory
+    CUTLASS_PRAGMA_UNROLL
+    for (int rid = 0; rid < AccumulatorFragmentIterator::TileIterations::kRow; ++rid) {
+    
+      CUTLASS_PRAGMA_UNROLL
+      for (int cid = 0; cid < AccumulatorFragmentIterator::TileIterations::kColumn; ++cid) {
+  
+        using AccumulatorAccessType = typename OutputOp::FragmentAccumulator;
+        using FragmentSmemAccessType = typename OutputOp::FragmentOutput;
+  
+        FragmentSmemAccessType * smem_frag_ptr =  
+          reinterpret_cast<FragmentSmemAccessType *>(&tb_frag_smem);
+  
+        CUTLASS_PRAGMA_UNROLL
+        for (int idx = 0; idx < AccumulatorFragmentIterator::kIterationsPerTile; ++idx) {
+          frag_iterator_accum.load(tb_frag_accum);
+          ++frag_iterator_accum;
+  
+          AccumulatorAccessType const * accumulator_frag_ptr = 
+            reinterpret_cast<AccumulatorAccessType const *>(&tb_frag_accum);
+          const int kOutputIterations = FragmentAccumulator::kElements / OutputOp::kCount;
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int it = 0; it < kOutputIterations; it++) {
+            smem_frag_ptr[idx * kOutputIterations + it] = output_op(accumulator_frag_ptr[it]);
+          }
+        }
+  
+        smem_iterator.store(tb_frag_smem);
+        ++smem_iterator;
+  
+      }
+    }
+  }
+
 };
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h b/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
index 71209b26..943cd5d0 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -59,7 +65,7 @@
 #include "cutlass/epilogue/threadblock/epilogue_base.h"
 #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h"
 
-#include "cutlass/util/index_sequence.h"
+#include "cutlass/numeric_types.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h b/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
index 3a4f3ac9..685777c6 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h
index e26784b6..549913a7 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
index 58fa3899..685b512e 100644
--- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
+++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -32,8 +38,6 @@
 
 #pragma once
 
-#include <assert.h>
-
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/array.h"
diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
index 8e9a37a4..83b07b99 100644
--- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
+++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -66,6 +72,58 @@ struct OutputTileShape {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+template <typename Iterations, typename Delta>
+struct OutputTileThreadMapHelpers {
+
+  /// Determines the iteration index of a vector access according to the thread map
+  CUTLASS_HOST_DEVICE
+  static void iteration_index(
+    int &column_idx,
+    int &row_idx,
+    int &group_idx,
+    int &cluster_idx,
+    int &tile_idx,
+    int iter_idx) {
+
+    column_idx = iter_idx % Iterations::kColumn;
+    int residual   = iter_idx / Iterations::kColumn;
+
+    row_idx    = residual % Iterations::kRow;
+    residual       = residual / Iterations::kRow;
+
+    group_idx  = residual % Iterations::kGroup;
+    residual       = residual / Iterations::kGroup;
+
+    cluster_idx = residual % Iterations::kCluster;
+    tile_idx    = residual / Iterations::kCluster;
+  }
+
+  /// Computes the offset of a given vector access
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord iteration_offset(int iter_idx) {
+
+    int column_idx;
+    int row_idx;
+    int group_idx;
+    int cluster_idx;
+    int tile_idx;
+
+    iteration_index(column_idx, row_idx, group_idx, cluster_idx, tile_idx, iter_idx);
+
+    return
+      MatrixCoord(
+        row_idx     * Delta::kRow     +
+        group_idx   * Delta::kGroup   +
+        cluster_idx * Delta::kCluster +
+        tile_idx    * Delta::kTile,
+
+        column_idx  * Delta::kColumn);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
 template <
   typename ThreadMap_,
   typename Shape_,
@@ -73,7 +131,7 @@ template <
   typename Delta_,
   typename Count_
 >
-struct OutputTileThreadMap {
+struct OutputTileThreadMap : public OutputTileThreadMapHelpers<Iterations_, Delta_> {
 
   /// Conventional thread map (concept: ThreadMap)
   using ThreadMap = ThreadMap_;
@@ -365,6 +423,12 @@ struct OutputTileOptimalThreadMap {
     );
   }
 
+  /// Computes the offset of a given vector access
+  CUTLASS_HOST_DEVICE
+  static MatrixCoord iteration_offset(int iter_idx) {
+    return OutputTileThreadMapHelpers<Iterations, Delta>::iteration_offset(iter_idx);
+  }
+
   /// Compacted thread map in which the 4D region is contiguous
   struct CompactedThreadMap {
 
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
index 294770d5..1cc8d1ce 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -63,6 +69,7 @@ namespace threadblock {
 template <
   typename ThreadMap_,       ///< Thread map (conept: OutputTileThreadMap)
   typename Element_,         ///< Element data type
+  bool ScatterD = false,     ///< Scatter D operand or not
   bool UseCUDAStore = false
 >
 class PredicatedTileIterator {
@@ -175,11 +182,20 @@ class PredicatedTileIterator {
   /// Extent of the matrix tile in rows
   Index extent_row_;
 
+  /// Extent of the matrix tile in rows
+  Index extent_column_;
+
   /// A thread's starting row position (assuming steady-state predicates have been computed)
   Index thread_start_row_;
 
+  /// A thread's starting column
+  Index thread_start_column_;
+
   /// Internal state counter
   int state_[3];
+
+  /// Scatter indices
+  int const *indices_; 
  
   //
   // Static asserts about internal strides
@@ -208,15 +224,19 @@ class PredicatedTileIterator {
     Element *pointer,
     TensorCoord extent,
     int thread_idx,
-    TensorCoord threadblock_offset = TensorCoord()
+    TensorCoord threadblock_offset = TensorCoord(),
+    int const *indices = nullptr
   ): 
-    params_(params)
+    params_(params), indices_(indices)
   {
 
     TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
 
     extent_row_ = extent.row();
+    extent_column_ = extent.column();
+
     thread_start_row_ = thread_offset.row();
+    thread_start_column_ = thread_offset.column();
 
     // Initialize predicates
     CUTLASS_PRAGMA_UNROLL
@@ -231,11 +251,20 @@ class PredicatedTileIterator {
       mask_.clear();
     }
 
+    if (ScatterD && !indices) {
+      mask_.clear();
+    }
+
     // Initialize pointer
     byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
       LongIndex(thread_offset.row()) * LongIndex(params_.stride) + 
       LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
 
+    if (ScatterD) {
+      byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) +
+        LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
+    }
+
     // Initialize internal state counter
     state_[0] = state_[1] = state_[2] = 0;
   }
@@ -273,6 +302,13 @@ class PredicatedTileIterator {
 
           AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
 
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
+              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
+          }
+
           CUTLASS_PRAGMA_UNROLL
           for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
 
@@ -290,7 +326,9 @@ class PredicatedTileIterator {
           }
 
           if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
           }
         }
 
@@ -304,6 +342,7 @@ class PredicatedTileIterator {
       }
     }
   }
+
   /// Loads a fragment from memory
   CUTLASS_DEVICE
   void load(Fragment &frag) const {
@@ -337,6 +376,13 @@ class PredicatedTileIterator {
 
           AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
 
+          if (ScatterD && row_guard) {
+            assert(indices_);
+
+            memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset +
+              LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride));
+          }
+
           CUTLASS_PRAGMA_UNROLL
           for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
 
@@ -356,7 +402,9 @@ class PredicatedTileIterator {
           }
 
           if (row + 1 < ThreadMap::Iterations::kRow) {
-            byte_pointer += params_.increment_row;
+            if (!ScatterD) {
+              byte_pointer += params_.increment_row;
+            }
           }
         }
 
@@ -371,7 +419,6 @@ class PredicatedTileIterator {
     }
   }
 
-
   /// Stores a fragment to memory
   CUTLASS_DEVICE
   void store(Fragment const &frag) const {
@@ -379,24 +426,189 @@ class PredicatedTileIterator {
     store_with_byte_offset(frag, 0);
   }
 
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void downsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+
+          int input_row = output_N * 2 * convolution_P * 2 * convolution_Q +
+            (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q;
+
+          int64_t byte_offset = (input_row-output_row)*problem_N*sizeof(float);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void upsample_load_with_byte_offset(Fragment &frag, int64_t byte_offset, int convolution_P, int convolution_Q, int add_P, int add_Q, int problem_N) const {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          int output_row = row_offset + thread_start_row_;
+          int output_N = output_row / (convolution_P * convolution_Q);
+          int output_PQ = output_row % (convolution_P * convolution_Q);
+          int output_P = output_PQ / convolution_Q;
+          int output_Q = output_PQ % convolution_Q;
+          int row_add_P = add_P;
+          int row_add_Q = add_Q;
+	  if (output_P > convolution_P - 2) row_add_P = 0;
+	  if (output_Q > convolution_Q - 2) row_add_Q = 0;
+
+          int input_row = output_N * (convolution_P/2) * (convolution_Q/2) +
+            ((output_P + row_add_P)/2) * (convolution_Q/2) + (output_Q + row_add_Q)/2;
+
+          int64_t byte_offset = (input_row-output_row)*problem_N*sizeof(float);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  MatrixCoord thread_start() const {
+    return MatrixCoord(thread_start_row_, thread_start_column_);
+  }
+
   /// Need to get the thread start row from the tile iterator
   CUTLASS_DEVICE
   int32_t thread_start_row() const {
     return thread_start_row_;
   }
 
+  /// Need to get the thread start row from the tile iterator
+  CUTLASS_DEVICE
+  int32_t thread_start_column() const {
+    return thread_start_column_;
+  }
+
   /// Extent of the matrix in rows
   CUTLASS_DEVICE
   Index extent_row() const {
     return extent_row_;
   }
 
+  /// Extent of the matrix in columns
+  CUTLASS_DEVICE
+  Index extent_column() const {
+    return extent_column_;
+  }
+
   /// Advances to the next position to load or store
   CUTLASS_HOST_DEVICE
   PredicatedTileIterator &operator++() {
 
     ++state_[0];
-    byte_pointer_ += params_.advance_row;
+
+    if (!ScatterD) {
+      byte_pointer_ += params_.advance_row;
+    }
+
     thread_start_row_ += ThreadMap::Shape::kRow;
     
     if (state_[0] == ThreadMap::Count::kRow) {
@@ -582,7 +794,8 @@ class InterleavedPredicatedTileIterator {
     Element *pointer,
     TensorCoord extent,
     int thread_idx,
-    TensorCoord threadblock_offset
+    TensorCoord threadblock_offset,
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     params_(params) {
     TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) +
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
index b3aa6aa6..90164a3a 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -252,7 +258,7 @@ class PredicatedTileIteratorAffineRankN {
   /// Extent of the matrix tile in rows
   Index extent_row_;
 
-  /// Extent of the matrix tile in rows
+  /// Extent of the matrix tile in columns
   Index extent_col_;
 
   /// A thread's starting row position (assuming steady-state predicates have been computed)
@@ -263,6 +269,9 @@ class PredicatedTileIteratorAffineRankN {
 
   /// Internal state counter
   int state_[3];
+
+  /// Offsets in columns, cached for performance
+  int64_t offset_modes_n_[ThreadMap::Iterations::kColumn];
  
   //
   // Static asserts about internal strides
@@ -290,7 +299,8 @@ class PredicatedTileIteratorAffineRankN {
     Element *pointer,
     MatrixCoord extent,
     int thread_idx,
-    MatrixCoord threadblock_offset = MatrixCoord()
+    MatrixCoord threadblock_offset = MatrixCoord(),
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ): 
     params_(params)
   {
@@ -307,10 +317,34 @@ class PredicatedTileIteratorAffineRankN {
       // Initialize predicates
       CUTLASS_PRAGMA_UNROLL
       for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
-  
-        mask_.predicates[c] = ((thread_offset.column() 
-          + ThreadMap::Delta::kColumn * c) < extent.column());
+
+        // 
+        // Compute coordinate and decompose into N modes
+        //
+
+        int coord_n = thread_start_column_ + c * ThreadMap::Delta::kColumn;
+
+        mask_.predicates[c] = coord_n < extent.column();
+        
+        Coord<Layout::kRank / 2, Index> modes_n;
+
+        int64_t offset_modes_n = 0;
+
+        if (kBigEndian) {
+          modes_n = CoordinateDecomposition<Layout::kRank / 2>(coord_n, params_.divmod_n);
+
+          offset_modes_n = dot(modes_n, params_.stride_n);
+        }
+        else {
+          modes_n = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_n, params_.divmod_n);
+
+          offset_modes_n = dot(modes_n, params_.stride_n);
+        }
+
+        offset_modes_n_[c] = offset_modes_n;
+
       }
+
       if (!pointer) {
         mask_.clear();
       }
@@ -382,18 +416,8 @@ class PredicatedTileIteratorAffineRankN {
             // Compute coordinate and decompose into N modes
             //
             
-            int coord_n = thread_start_column_ + column * ThreadMap::Delta::kColumn;
-            
-            Coord<Layout::kRank / 2, Index> modes_n;
-
             if (Layout::kRank > 2) {
-              if (kBigEndian) {
-                modes_n = CoordinateDecomposition<Layout::kRank / 2>(coord_n, params_.divmod_n);
-              } else {
-                modes_n = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_n, params_.divmod_n);
-              }
-
-              offset_modes_n = dot(modes_n, params_.stride_n);
+              offset_modes_n = offset_modes_n_[column];
             }
 
             //
@@ -490,19 +514,8 @@ class PredicatedTileIteratorAffineRankN {
             // Compute coordinate and decompose into N modes
             //
             
-            int coord_n = thread_start_column_ + column * ThreadMap::Delta::kColumn;
-            
-            Coord<Layout::kRank / 2, Index> modes_n;
-
             if (Layout::kRank > 2) {
-              if (kBigEndian) {
-                modes_n = CoordinateDecomposition<Layout::kRank / 2>(coord_n, params_.divmod_n);
-              }
-              else {
-                modes_n = CoordinateDecompositionLittleEndian<Layout::kRank / 2>(coord_n, params_.divmod_n);
-              }
-
-              offset_modes_n = dot(modes_n, params_.stride_n);
+              offset_modes_n = offset_modes_n_[column];
             } 
 
             //
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
new file mode 100644
index 00000000..bbac398c
--- /dev/null
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
@@ -0,0 +1,633 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Epilogue for threadblock scoped GEMMs using Tensor Ops.
+
+  The epilogue rearranges the result of a matrix product through shared memory to match canonical
+  tensor layouts in global memory. Epilogues support conversion and reduction operations.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/transform/pitch_linear_thread_map.h"
+#include "cutlass/epilogue/threadblock/output_tile_thread_map.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace epilogue {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Tile iterator used to load and store output tile from global memory in epilogue.
+///
+/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator
+///
+template <
+  typename ThreadMap_,                     ///< Thread map (conept: OutputTileThreadMap)
+  typename Element_,                        ///< Element data type
+  BlasMode BlasMode_ = BlasMode::kGemm   ///< Tile Iterator for a Symmetric or Hermitian Kernel
+>
+class PredicatedTileIteratorBlas3 {
+public:
+  using ThreadMap = ThreadMap_;
+  using Shape = typename ThreadMap::Shape;
+
+  using Element = Element_;
+
+  using Layout = layout::RowMajor;
+  using TensorRef = TensorRef<Element, Layout>;
+  using ConstTensorRef = typename TensorRef::ConstTensorRef;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using TensorCoord = MatrixCoord;
+
+  static BlasMode const kBlasMode = BlasMode_;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kThreads = ThreadMap::kThreads;
+  static int const kIterations = ThreadMap::Count::kTile;
+
+  static_assert( ThreadMap::Iterations::kRow > 0,"ThreadMap::Iterations::kRow must be > 0");
+  static_assert( ThreadMap::Iterations::kGroup > 0,"ThreadMap::Iterations::kGroup must be > 0");
+  static_assert( ThreadMap::Iterations::kCluster > 0,"ThreadMap::Iterations::kCluster must be > 0");
+  static_assert( ThreadMap::Iterations::kColumn > 0,"ThreadMap::Iterations::kColumn must be > 0");
+
+  /// Fragment object
+  using Fragment = Array<
+    Element, 
+    ThreadMap::Iterations::kColumn * 
+    ThreadMap::Iterations::kRow * 
+    ThreadMap::Iterations::kGroup * 
+    ThreadMap::Iterations::kCluster * ThreadMap::kElementsPerAccess>;
+
+  /// Memory access size
+  using AccessType = AlignedArray<Element, ThreadMap::kElementsPerAccess>;
+  static_assert( AccessType::kElements == 1, "BLAS3 Epilogue must use AccessType::kElements as 1");
+
+  //
+  // Parameters struct
+  //
+
+  /// Uses a non-template class
+  struct Params : PredicatedTileIteratorParams {
+
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): 
+      PredicatedTileIteratorParams(
+        layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess,
+        make_OutputTileThreadMapDesc<ThreadMap>()
+      ) 
+    {
+        
+    }
+  };
+
+  /// Mask object
+  struct Mask {
+
+    static int const kCount = ThreadMap::Iterations::kColumn;
+
+    /// Predicate state
+    bool predicates[kCount];
+
+    //
+    // Mask
+    //
+    CUTLASS_HOST_DEVICE
+    Mask() {
+      enable();
+    }
+
+    ///< Efficiently disables all accesses guarded by mask
+    CUTLASS_HOST_DEVICE void clear() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = false;
+      }
+    }
+
+    ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask
+    CUTLASS_DEVICE void enable() {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kCount; ++i) {
+        predicates[i] = true;
+      }
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Parameters structure containing reference and precomputed state.
+  PredicatedTileIteratorParams params_;
+
+  /// Byte-level pointer
+  uint8_t *byte_pointer_;
+
+  /// Fill Mode for a tile on diagonal of a symmetric kernel
+  cutlass::FillMode fill_mode;
+
+  /// Array of boolean values to contain steady-state predicates
+  Mask mask_;
+
+  /// Extent of the matrix tile in rows
+  Index extent_row_;
+
+  /// A thread's starting row position (assuming steady-state predicates have been computed)
+  Index thread_start_row_;
+
+  /// Internal state counter
+  int state_[3];
+
+  /// Starting address of the matrix  
+  size_t matrix_start_addr; 
+ 
+  static_assert((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian), 
+    "Unsupported blas3 mode.");
+
+private:
+
+  //
+  // Methods
+  //
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor
+  CUTLASS_DEVICE
+  PredicatedTileIteratorBlas3(
+    PredicatedTileIteratorParams const & params,
+    Element *pointer,
+    TensorCoord extent,
+    int thread_idx,
+    TensorCoord threadblock_offset
+    , cutlass::FillMode fill_mode
+  ): 
+    params_(params), fill_mode(fill_mode)
+  {
+
+    TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset;
+
+    extent_row_ = extent.row();
+    thread_start_row_ = thread_offset.row();
+
+    // Initialize predicates
+    CUTLASS_PRAGMA_UNROLL
+    for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) {
+
+      mask_.predicates[c] = ((thread_offset.column() 
+        + ThreadMap::Delta::kColumn * c) < extent.column());
+    }
+
+    // Check Symmetric kernel modes (Lower and Upper - for diagonal CTAs, None for rest CTAs)
+    if ((kBlasMode == BlasMode::kSymmetric || kBlasMode == BlasMode::kHermitian) && 
+        fill_mode == cutlass::FillMode::kInvalid) {
+      arch::device_breakpoint();
+    }
+
+    // Starting address of the matrix
+    matrix_start_addr =  reinterpret_cast<size_t>(pointer); 
+
+    // Initialize pointer
+    byte_pointer_ = reinterpret_cast<uint8_t *>(pointer) + 
+      LongIndex(thread_offset.row()) * LongIndex(params_.stride) + 
+      LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess;
+
+    // Initialize internal state counter
+    state_[0] = state_[1] = state_[2] = 0;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    byte_pointer_ += pointer_offset * sizeof_bits<Element>::value / 8;
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment on the diagonal of a symmetric kernel to memory 
+  CUTLASS_DEVICE
+  void load_symmetric_with_byte_offset(Fragment &frag, int64_t byte_offset) {
+
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          // Offset of row from beginning of the matrix per thread
+          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
+
+          // Absolute row index
+          int row_index = int(row_start_offset/params_.stride);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            // Offset of column from beginning of row per thread     
+            size_t col_start_offset = row_start_offset + 
+                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
+
+            // Absolute column index
+            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
+            guard = guard && ( (isLowerMode && row_index >= col_index) ||
+                               (!isLowerMode && row_index <= col_index) );
+
+            cutlass::arch::global_load<
+              AccessType, 
+              sizeof(AccessType)
+            >(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+
+            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
+            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
+              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr);
+
+              if (row_index == col_index) {
+                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
+                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
+              }
+            }
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    
+    if (fill_mode == cutlass::FillMode::kNone) {
+      load_with_byte_offset(frag, 0);
+    }
+    else {
+      load_symmetric_with_byte_offset(frag, 0);
+    }
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment on the diagonal of a symmetric kernel to memory 
+  CUTLASS_DEVICE
+  void store_symmetric_with_byte_offset(Fragment const &frag, int64_t byte_offset) {
+    uint8_t *byte_pointer = byte_pointer_;
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    bool isLowerMode = (fill_mode == cutlass::FillMode::kLower) ? true : false;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) {
+
+          int frag_row_idx = 
+            (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster));
+
+          int row_offset = row * ThreadMap::Delta::kRow 
+            + group * ThreadMap::Delta::kGroup 
+            + cluster * ThreadMap::Delta::kCluster;
+
+          bool row_guard = ((row_offset + thread_start_row_) < extent_row_);
+
+          AccessType *memory_pointer = reinterpret_cast<AccessType *>(byte_pointer + byte_offset);
+
+          // Offset of row from beginning of the matrix per thread
+          size_t row_start_offset = (size_t)memory_pointer - matrix_start_addr;
+
+          // Absolute row index
+          int row_index = int(row_start_offset/params_.stride);
+
+          CUTLASS_PRAGMA_UNROLL
+          for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
+
+            bool guard = row_guard && mask_.predicates[column];
+
+            // Offset of column from beginning of row per thread     
+            size_t col_start_offset = row_start_offset + 
+                        (column * ThreadMap::Delta::kColumn / kElementsPerAccess) * sizeof(AccessType);
+
+            // Absolute column index
+            size_t col_index = (col_start_offset%params_.stride)/sizeof(AccessType);
+
+            guard = guard && ( (isLowerMode && row_index >= col_index) ||
+                               (!isLowerMode && row_index <= col_index) );
+
+            // The imaginary parts of the diagonal elements of a complex element are assumed and set to zero
+            if (guard && kBlasMode == BlasMode::kHermitian && cutlass::is_complex<Element>::value) {
+
+              AccessType *frag_ptr_modify = const_cast<AccessType *>(frag_ptr);
+              Element *scalar_ptr = reinterpret_cast<Element *>(frag_ptr_modify);
+
+              if (row_index == col_index) {
+                scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = 
+                  real(scalar_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]);
+              }
+            }
+
+            cutlass::arch::global_store<AccessType, sizeof(AccessType)>(
+                frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn +
+                         column],
+                (void *)&memory_pointer[column * ThreadMap::Delta::kColumn /
+                                        kElementsPerAccess],
+                guard);
+          }
+
+          if (row + 1 < ThreadMap::Iterations::kRow) {
+            byte_pointer += params_.increment_row;
+          }
+        }
+
+        if (group + 1 < ThreadMap::Iterations::kGroup) {
+          byte_pointer += params_.increment_group;
+        }
+      }
+
+      if (cluster + 1 < ThreadMap::Iterations::kCluster) {
+        byte_pointer += params_.increment_cluster;
+      }
+    }
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    
+    if (fill_mode == cutlass::FillMode::kNone) {
+      store_with_byte_offset(frag, 0);
+    }
+    else {
+      store_symmetric_with_byte_offset(frag, 0); 
+    }
+
+  }
+
+  /// Advances to the next position to load or store
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorBlas3 &operator++() {
+
+    ++state_[0];
+    byte_pointer_ += params_.advance_row;
+    thread_start_row_ += ThreadMap::Shape::kRow;
+    
+    if (state_[0] == ThreadMap::Count::kRow) {
+
+      state_[0] = 0;
+      ++state_[1];
+      byte_pointer_ += params_.advance_group;
+
+      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * 
+        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
+
+      if (state_[1] == ThreadMap::Count::kGroup) {
+
+        state_[1] = 0;
+        ++state_[2];
+        byte_pointer_ += params_.advance_cluster;
+
+        thread_start_row_ += ThreadMap::Count::kGroup * 
+          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
+
+        if (state_[2] == ThreadMap::Count::kCluster) {
+          state_[2] = 0;
+          byte_pointer_ += params_.advance_tile;
+        }
+      }
+    }
+
+    return *this;
+  }
+
+  ///< Efficiently disables all accesses guarded by mask
+  CUTLASS_DEVICE void clear_mask() {
+    mask_.clear();
+  }
+
+  ///< Efficiently enables all accesses guarded by mask
+  CUTLASS_DEVICE void enable_mask() {
+    mask_.enable();
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void get_mask(Mask &mask) {
+    mask = mask_;
+  }
+
+  ///< Sets the mask
+  CUTLASS_DEVICE void set_mask(Mask const &mask) {
+    mask_ = mask;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace epilogue
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
index 143f9500..792c1697 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -74,6 +80,13 @@ struct OutputTileShapeDesc {
   int count() const {
     return column * row * group * cluster * tile;
   }
+
+  #if 0
+  CUTLASS_HOST_DEVICE
+  void print() const {
+    printf("{%d, %d, %d, %d, %d}", column, row, group, cluster, tile);
+  }
+  #endif
 };
 
 /// Helper template to construct an OutputTileShapeDesc from a OutputTileShape template.
@@ -122,7 +135,10 @@ struct OutputTileThreadMapDesc {
     shape(shape_),
     iterations(iterations_),
     delta(delta_),
-    count(count_) { }
+    count(count_) 
+  {
+    
+  }
 };
 
 /// Helper template to construct an OutputTileShapeDesc from a OutputTileThreadMap template.
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
index d7a63421..8c657ff8 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
index 103fe2be..f730e854 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h
index 9224dc20..b9b9b284 100644
--- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h
+++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
index 1440bacb..d845d951 100644
--- a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
+++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -238,11 +244,12 @@ class SharedLoadIteratorMixed<ThreadMap_, Element_, 32, 16, 8, 8> {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Partial specialization for int32_t x 8 => int8_t x 8
+/// Partial specialization for int32_t x 16 => int8_t/int4b_t x 16 
 template <
-  typename ThreadMap_       ///< Thread map (conept: OutputTileThreadMap)
+  typename ThreadMap_,      ///< Thread map (conept: OutputTileThreadMap)
+  int OutputSizeBits_       ///< Size of output element in bits
 >
-class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 8, 16, 8> {
+class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 16, 8> {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;
@@ -398,11 +405,12 @@ class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 8, 16, 8> {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Partial specialization for int32_t x 8 => int8_t x 8
+/// Partial specialization for int32_t x 8 => int8_t/int4b_t x 8
 template <
-  typename ThreadMap_       ///< Thread map (conept: OutputTileThreadMap)
+  typename ThreadMap_,      ///< Thread map (conept: OutputTileThreadMap)
+  int OutputSizeBits_
 >
-class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, 8, 8, 8> {
+class SharedLoadIteratorMixed<ThreadMap_, int32_t, 32, OutputSizeBits_, 8, 8> {
 public:
   using ThreadMap = ThreadMap_;
   using Shape = typename ThreadMap::Shape;
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
index fcade31f..4055f21d 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
index 6d545230..425589fd 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h
index 4ba84b25..35e6373a 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
index f234806c..e0dd3ee4 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
index ef1e19f3..9ca1e121 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
index 3a381940..7538a224 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h
index ca7a486a..5f325266 100644
--- a/include/cutlass/epilogue/warp/simt_policy.h
+++ b/include/cutlass/epilogue/warp/simt_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h
index c9c41978..63b13147 100644
--- a/include/cutlass/epilogue/warp/tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/tensor_op_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h
index 9fa88483..c2d80191 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_simt.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
index 5ec15575..24d94c75 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -260,7 +266,8 @@ class TileIteratorTensorOp<WarpShape_, OperatorShape_, Element_,
   /// This is the fragment size produced by one tile
   using Fragment = Array<
     Element, 
-    Policy::OperatorCount::kRow * Policy::kElementsPerIteration>;
+    Policy::OperatorCount::kRow * Policy::kIterationsPerInstruction 
+        * Policy::kElementsPerIteration>;
 
   /// This is the fragment size produced by one iteration
 //  using Fragment = Array<
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
index 4c22a420..eadb779a 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -217,8 +223,18 @@ class TileIteratorTensorOpMixed {
 #if CUTLASS_EPILOGUE_WARP_TILE_ITERATOR_TENSOR_OP_MIXED_OPTIMIZATION_ENABLED
 
     // When the optimization is enabled, small tiles require separate logic.
-    if (WarpShape::kN == 32 && warp_column_ > 0) {
-      ptr = pointers_[1];
+    bool kN32_optimization = (WarpShape::kN * Detail::kLanesInQuad * Policy::kElementsPerAccess * sizeof_bits<Element>::value) % 1024 == 0;
+    if (kN32_optimization) {
+      int ptr_idx = ((warp_column_ * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
+      if (ptr_idx == 0) {
+        ptr = pointers_[0];
+      } else if (ptr_idx == 1) {
+        ptr = pointers_[1];
+      } else if (ptr_idx == 2) {
+        ptr = pointers_[2];
+      } else if (ptr_idx == 3) {
+        ptr = pointers_[3];
+      }
     }
 
 #endif
@@ -234,28 +250,26 @@ class TileIteratorTensorOpMixed {
       if (WarpShape::kN == 64) {
         ptr = pointers_[n / 4];
       }
-      else 
+      else if (!kN32_optimization)
 #endif
-    {
-      // This is the reference implementation
-      int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess;
-      int ptr_idx = ((column_idx * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
-
-      if (ptr_idx == 0) {
-        ptr = pointers_[0 % Detail::kPointerCount];
-      }
-      else if (ptr_idx == 1) {
-        ptr = pointers_[1 % Detail::kPointerCount];
-      }
-      else if (ptr_idx == 2) {
-        ptr = pointers_[2 % Detail::kPointerCount];
-      }
-      else if (ptr_idx == 3) {
-        ptr = pointers_[3 % Detail::kPointerCount];
+      {
+        // This is the reference implementation
+        int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess;
+        int ptr_idx = ((column_idx * sizeof_bits<Element>::value) / 1024) % Detail::kPointerCount;
+  
+        if (ptr_idx == 0) {
+          ptr = pointers_[0 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 1) {
+          ptr = pointers_[1 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 2) {
+          ptr = pointers_[2 % Detail::kPointerCount];
+        }
+        else if (ptr_idx == 3) {
+          ptr = pointers_[3 % Detail::kPointerCount];
+        }
       }
-    }
-
-
 
       int offset = n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess;
       ptr[offset] = frag_ptr[n];
@@ -294,12 +308,13 @@ class TileIteratorTensorOpMixed {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Partial specialization for int32_t x 16 => int8_t x 16
+/// Partial specialization for int32_t x 16 => int8_t/int4b_t x 16
 template <
   typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape),
+  int OutputSizeBits              ///< Size of output element in bits
 >
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, 8, 16, 8> {
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 16, 8> {
 public:
 
   using WarpShape = WarpShape_;
@@ -495,12 +510,13 @@ class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, 8, 16,
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-/// Partial specialization for int32_t x 8 => int8_t x 8
+/// Partial specialization for int32_t x 8 => int8_t/int4b_t x 8
 template <
   typename WarpShape_,            ///< shape of warp-level GEMM (concept: GemmShape)
-  typename OperatorShape_         ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  typename OperatorShape_,        ///< matrix multiply operation shape (concept: gemm::GemmShape)
+  int OutputSizeBits              ///< Size of output element in bits
 >
-class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, 8, 8, 8> {
+class TileIteratorTensorOpMixed<WarpShape_, OperatorShape_, int32_t, 32, OutputSizeBits, 8, 8> {
 public:
 
   using WarpShape = WarpShape_;
diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
index ccdfca5b..403f7d89 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
index 88723a7a..630ff73d 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
index 1526b6ed..8aef282e 100644
--- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
index 9217e5e1..a4524b5b 100644
--- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h
index 8ab296c2..14b6522c 100644
--- a/include/cutlass/fast_math.h
+++ b/include/cutlass/fast_math.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -696,7 +702,7 @@ double fast_sqrt(double theta) {
 CUTLASS_HOST_DEVICE
 float fast_exp(float x) {
   #if defined(__CUDA_ARCH__)
-  return ::exp(x);
+  return ::expf(x);
   #else
   return std::exp(x);
   #endif
@@ -705,7 +711,7 @@ float fast_exp(float x) {
 CUTLASS_HOST_DEVICE
 double fast_exp(double x) {
   #if defined(__CUDA_ARCH__)
-  return ::expf(x);
+  return ::exp(x);
   #else
   return std::exp(x);
   #endif
@@ -887,6 +893,18 @@ struct fast_tanh_op<Array<T, N>> {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Absolute value function
+template <typename T>
+CUTLASS_HOST_DEVICE
+T absolute_value(T x) {
+  if (x < T()) {
+    return -x;
+  }
+  return x;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 63f16c3e..582a8e30 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -1,24 +1,30 @@
   /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -41,6 +47,14 @@ namespace cutlass {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <typename T>
+struct absolute_value_op {
+  CUTLASS_HOST_DEVICE
+  T operator()(T lhs) const {
+    return abs(lhs);
+  }
+};
+
 template <typename T>
 struct plus {
   CUTLASS_HOST_DEVICE
@@ -559,6 +573,24 @@ struct multiply_add<T, complex<T>, complex<T>> {
 //
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <typename T, int N>
+struct absolute_value_op< Array<T, N> > {
+
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &lhs) const {
+
+    Array<T, N> result;
+    absolute_value_op<T> scalar_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      result[i] = scalar_op(lhs[i]);
+    }
+
+    return result;
+  }
+};
+
 template <typename T, int N>
 struct plus<Array<T, N>> {
   CUTLASS_HOST_DEVICE
@@ -1773,7 +1805,7 @@ struct multiply_add_relu0<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>>
 
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
-      result[i] = mx(op(a[i], b[i], c));
+      result[i] = mx(op(a[i], b[i], c), half_t(0));
     }
     #endif
 
diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h
index 02975c32..55592d2b 100644
--- a/include/cutlass/gemm/device/default_gemm_configuration.h
+++ b/include/cutlass/gemm/device/default_gemm_configuration.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -164,7 +170,7 @@ struct DefaultGemmConfiguration<
 
   using ThreadblockShape = GemmShape<128, 256, 32>;
   using WarpShape = GemmShape<64, 64, 32>;
-  using InstructionShape = GemmShape<16, 16, 4>;
+  using InstructionShape = GemmShape<8, 8, 4>;
   static int const kStages = 2;
   
   using EpilogueOutputOp = epilogue::thread::LinearCombination<
diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h
index aeb4da4a..045fba74 100644
--- a/include/cutlass/gemm/device/gemm.h
+++ b/include/cutlass/gemm/device/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -213,7 +219,13 @@ template <
     /// Operation performed by GEMM
     typename Operator_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator>
+        ElementAccumulator_>::Operator,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false>
 class Gemm {
  public:
 
@@ -264,7 +276,11 @@ class Gemm {
     ThreadblockSwizzle,
     kStages,
     kSplitKSerial,
-    Operator
+    Operator,
+    SharedMemoryClearOption::kNone,
+    GatherA,
+    GatherB,
+    ScatterD
   >::GemmKernel;
 
   /// Argument structure
@@ -281,6 +297,10 @@ class Gemm {
     TensorRef<ElementC, LayoutC> ref_D;
     typename EpilogueOutputOp::Params epilogue;
     int split_k_slices;
+    // For gather+scatter operations
+    int const *gather_A_indices;
+    int const *gather_B_indices;
+    int const *scatter_D_indices;
 
     //
     // Methods
@@ -302,7 +322,10 @@ class Gemm {
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ = 
         typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
+      int split_k_slices = 1,
+      int const *gather_A_indices_ = nullptr,
+      int const *gather_B_indices_ = nullptr,
+      int const *scatter_D_indices_ = nullptr
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
@@ -310,7 +333,10 @@ class Gemm {
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
-      split_k_slices(split_k_slices) {
+      split_k_slices(split_k_slices),
+      gather_A_indices(gather_A_indices_),
+      gather_B_indices(gather_B_indices_),
+      scatter_D_indices(scatter_D_indices_) {
 
     }
   };
@@ -410,7 +436,10 @@ class Gemm {
       args.ref_C.non_const_ref(),
       args.ref_D,
       args.epilogue,
-      static_cast<int *>(workspace)
+      static_cast<int *>(workspace),
+      args.gather_A_indices,
+      args.gather_B_indices,
+      args.scatter_D_indices
     };
 
     return Status::kSuccess;
@@ -524,13 +553,20 @@ template <
     /// If true, kernel supports split-K as a serial reduction
     bool SplitKSerial,
     /// Operation performed by GEMM
-    typename Operator_>
+    typename Operator_,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
+>
 class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
            layout::ColumnMajor,  // partially specialized on LayoutC
            ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
            WarpShape_, InstructionShape_, EpilogueOutputOp_,
            ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
-           Operator_> {
+           Operator_, GatherA, GatherB, ScatterD> {
  public:
 
   using ElementA = ElementA_;
@@ -578,7 +614,10 @@ class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
     kAlignmentB,
     kAlignmentA,
     SplitKSerial,
-    Operator
+    Operator,
+    GatherB,
+    GatherA,
+    ScatterD
   >;
 
   using UnderlyingArguments = typename UnderlyingOperator::Arguments;
@@ -599,6 +638,10 @@ class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
     TensorRef<ElementC, LayoutC> ref_D;
     typename EpilogueOutputOp::Params epilogue;
     int split_k_slices;
+    // For gather+scatter operations
+    int *gather_A_indices;
+    int *gather_B_indices;
+    int *scatter_D_indices;
 
     //
     // Methods
@@ -618,7 +661,10 @@ class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
       TensorRef<ElementC, LayoutC> ref_D_,
       typename EpilogueOutputOp::Params epilogue_ = 
         typename EpilogueOutputOp::Params(),
-      int split_k_slices = 1
+      int split_k_slices = 1,
+      int *gather_A_indices_ = nullptr,
+      int *gather_B_indices_ = nullptr,
+      int *scatter_D_indices_ = nullptr
     ):
       problem_size(problem_size_),
       ref_A(ref_A_),
@@ -626,7 +672,10 @@ class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
       ref_C(ref_C_),
       ref_D(ref_D_),
       epilogue(epilogue_),
-      split_k_slices(split_k_slices) { }
+      split_k_slices(split_k_slices),
+      gather_A_indices(gather_A_indices_),
+      gather_B_indices(gather_B_indices_),
+      scatter_D_indices(scatter_D_indices_) { }
   };
 
 private:
@@ -647,7 +696,10 @@ class Gemm<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
       {args.ref_C.data(), args.ref_C.stride(0)},
       {args.ref_D.data(), args.ref_D.stride(0)},
       args.epilogue,
-      args.split_k_slices
+      args.split_k_slices,
+      args.gather_B_indices,
+      args.gather_A_indices,
+      args.scatter_D_indices
     );
   }
 
diff --git a/include/cutlass/gemm/device/gemm_array.h b/include/cutlass/gemm/device/gemm_array.h
index fa7e8e59..ac47a5c9 100644
--- a/include/cutlass/gemm/device/gemm_array.h
+++ b/include/cutlass/gemm/device/gemm_array.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -353,13 +359,6 @@ class GemmArray {
       return Status::kErrorMisalignedOperand;
     }
 
-    if ((args.problem_size.m() % kAlignmentA) || (args.problem_size.k() % kAlignmentA) ||
-      (args.problem_size.n() % kAlignmentB) || (args.problem_size.k() % kAlignmentB) ||
-      (args.problem_size.m() % kAlignmentC) || (args.problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
     return Status::kSuccess;
   }
 
diff --git a/include/cutlass/gemm/device/gemm_batched.h b/include/cutlass/gemm/device/gemm_batched.h
index afa22fe9..fe265b5a 100644
--- a/include/cutlass/gemm/device/gemm_batched.h
+++ b/include/cutlass/gemm/device/gemm_batched.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -348,13 +354,6 @@ class GemmBatched {
       return Status::kErrorMisalignedOperand;
     }
 
-    if ((args.problem_size.m() % kAlignmentA) || (args.problem_size.k() % kAlignmentA) ||
-      (args.problem_size.n() % kAlignmentB) || (args.problem_size.k() % kAlignmentB) ||
-      (args.problem_size.m() % kAlignmentC) || (args.problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
     return Status::kSuccess;
   }
 
diff --git a/include/cutlass/gemm/device/gemm_complex.h b/include/cutlass/gemm/device/gemm_complex.h
index be4e19b6..a2306361 100644
--- a/include/cutlass/gemm/device/gemm_complex.h
+++ b/include/cutlass/gemm/device/gemm_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/device/gemm_grouped.h b/include/cutlass/gemm/device/gemm_grouped.h
index 11290237..f489ba93 100644
--- a/include/cutlass/gemm/device/gemm_grouped.h
+++ b/include/cutlass/gemm/device/gemm_grouped.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -60,28 +66,41 @@ class GemmGrouped {
 public:
 
   using GemmKernel = GemmKernel_;
-  using ThreadblockShape = typename GemmKernel::Mma::Shape;
   
   using ElementA = typename GemmKernel::ElementA;
   using LayoutA = typename GemmKernel::LayoutA;
   using TensorRefA = TensorRef<ElementA const, LayoutA>;
   static ComplexTransform const kTransformA = GemmKernel::kTransformA;
+  static int const kAlignmentA = GemmKernel::kAlignmentA;
 
   using ElementB = typename GemmKernel::ElementB;
   using LayoutB = typename GemmKernel::LayoutB;
   using TensorRefB = TensorRef<ElementB const, LayoutB>;
   static ComplexTransform const kTransformB = GemmKernel::kTransformB;
+  static int const kAlignmentB = GemmKernel::kAlignmentB;
 
   using ElementC = typename GemmKernel::ElementC;
   using LayoutC = typename GemmKernel::LayoutC;
   using TensorRefC = TensorRef<ElementC const, LayoutC>;
   using TensorRefD = TensorRef<ElementC, LayoutC>;
+  static int const kAlignmentC = GemmKernel::kAlignmentC;
 
   using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC;
 
   using EpilogueOutputOp = typename GemmKernel::EpilogueOutputOp;
   using ThreadblockSwizzle = typename GemmKernel::ThreadblockSwizzle;
+
   using Operator = typename GemmKernel::Operator;
+  using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename WarpMmaOperator::MathOperator;
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+  using ThreadblockShape = typename GemmKernel::Mma::Shape;
+  using WarpShape = typename GemmKernel::WarpShape;
+  using InstructionShape = typename GemmKernel::InstructionShape;
+  static int const kStages = GemmKernel::Mma::kStages;
 
   /// Argument structure
   using Arguments = typename GemmKernel::Arguments;
@@ -99,21 +118,6 @@ class GemmGrouped {
   /// Determines whether the GEMM can execute the given problem.
   static Status can_implement(Arguments const &args) {
     
-    // Determine grid shape
-    cutlass::gemm::GemmCoord grid_tiled_shape;
-    int gemm_k_size = 0;
-    
-    get_grid_shape_(grid_tiled_shape, gemm_k_size, args);
-    
-    ThreadblockSwizzle threadblock_swizzle;
-    dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
-  
-    if (!(grid.y <= std::numeric_limits<uint16_t>::max() && 
-          grid.z <= std::numeric_limits<uint16_t>::max())) {
-
-      return Status::kErrorInvalidProblem;
-    } 
-
     return GemmKernel::can_implement(args);
   }
 
diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h
index 581ca41d..34341771 100644
--- a/include/cutlass/gemm/device/gemm_sparse.h
+++ b/include/cutlass/gemm/device/gemm_sparse.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h
index acf2cefa..243a808a 100644
--- a/include/cutlass/gemm/device/gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h
index 224666c2..ddd997a8 100644
--- a/include/cutlass/gemm/device/gemm_universal.h
+++ b/include/cutlass/gemm/device/gemm_universal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -111,10 +117,16 @@ template <
     /// Complex elementwise transformation on A operand
     ComplexTransform TransformA = ComplexTransform::kNone,
     /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB = ComplexTransform::kNone
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false
 >
 class GemmUniversal : 
-  GemmUniversalBase<
+  public GemmUniversalBase<
     typename kernel::DefaultGemmUniversal<
       ElementA_,
       LayoutA_,
@@ -135,7 +147,11 @@ class GemmUniversal :
       EpilogueOutputOp_,
       ThreadblockSwizzle_,
       Stages,
-      Operator_
+      Operator_,
+      SharedMemoryClearOption::kNone,
+      GatherA,
+      GatherB,
+      ScatterD
     >::GemmKernel
   > {
 
@@ -178,7 +194,11 @@ class GemmUniversal :
       EpilogueOutputOp_,
       ThreadblockSwizzle_,
       Stages,
-      Operator_
+      Operator_,
+      SharedMemoryClearOption::kNone,
+      GatherA,
+      GatherB,
+      ScatterD
     >::GemmKernel
   >;
 
@@ -229,13 +249,20 @@ template <
     /// Complex elementwise transformation on A operand
     ComplexTransform TransformA,
     /// Complex elementwise transformation on B operand
-    ComplexTransform TransformB>
+    ComplexTransform TransformB,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
+>
 class GemmUniversal<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
            layout::ColumnMajor,  // partially specialized on LayoutC
            ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
            WarpShape_, InstructionShape_, EpilogueOutputOp_,
            ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
-           Operator_, TransformA, TransformB> {
+           Operator_, TransformA, TransformB, GatherA, GatherB, ScatterD> {
  public:
 
   using ElementA = ElementA_;
@@ -283,7 +310,10 @@ class GemmUniversal<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
     kAlignmentA,
     Operator,
     kTransformB,
-    kTransformA
+    kTransformA,
+    GatherB,
+    GatherA,
+    ScatterD
   >::Base;
 
   using GemmKernel = typename UnderlyingOperator::GemmKernel;
diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h
index 28375af2..95196ae2 100644
--- a/include/cutlass/gemm/device/gemm_universal_adapter.h
+++ b/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -49,7 +55,7 @@ class GemmUniversalAdapter {
   using GemmKernel = GemmKernel_;
 
   static bool const kInternalTranspose = 
-    std::is_same<typename GemmKernel::LayoutC, cutlass::layout::RowMajor>::value;
+    platform::is_same<typename GemmKernel::LayoutC, cutlass::layout::RowMajor>::value;
 
   using ThreadblockShape = typename GemmKernel::Mma::Shape;
   using WarpShape = typename GemmKernel::WarpShape;
diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h
index aa85a514..f338950c 100644
--- a/include/cutlass/gemm/device/gemm_universal_base.h
+++ b/include/cutlass/gemm/device/gemm_universal_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -30,7 +36,7 @@
 
 #pragma once
 
-#include <limits>
+//#include <limits>
 
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
@@ -134,9 +140,10 @@ class GemmUniversalBase {
     
     ThreadblockSwizzle threadblock_swizzle;
     dim3 grid = threadblock_swizzle.get_grid_shape(grid_tiled_shape);
+
+    uint32_t const kGridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1);
   
-    if (!(grid.y <= std::numeric_limits<uint16_t>::max() && 
-          grid.z <= std::numeric_limits<uint16_t>::max())) {
+    if (!(grid.y <= kGridYZMax && grid.z <= kGridYZMax)) {
 
       return Status::kErrorInvalidProblem;
     } 
diff --git a/include/cutlass/gemm/device/gemv.h b/include/cutlass/gemm/device/gemv.h
index db39efa3..18d98f2e 100644
--- a/include/cutlass/gemm/device/gemv.h
+++ b/include/cutlass/gemm/device/gemv.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/device/rank_2k.h b/include/cutlass/gemm/device/rank_2k.h
new file mode 100644
index 00000000..3630406e
--- /dev/null
+++ b/include/cutlass/gemm/device/rank_2k.h
@@ -0,0 +1,547 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined Rank2K kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+
+#include "cutlass/gemm/kernel/default_rank_2k_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementB_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by SYRK
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class Rank2K {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 2;
+
+  // static asserts for rank 2k update kernel
+  static_assert(std::is_same<LayoutA, LayoutB>::value,
+    "Rank 2K update operator support same layouts for operandA and B");
+
+  /// Define the kernel
+  using Rank2Kkernel = typename kernel::DefaultRank2KUniversal<
+    ElementA,
+    LayoutA,
+    kTransformA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kTransformB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::Rank2Kkernel;
+  
+  using Arguments = typename Rank2Kkernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename Rank2Kkernel::Params params_;
+public:
+
+  /// Constructs the SYRK.
+  Rank2K() { }
+
+  /// Determines whether the SYRK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = Rank2Kkernel::can_implement(args);
+   
+    if (FillModeC != FillMode::kLower && FillModeC != FillMode::kUpper) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+    
+    if (kSplitKSerial && args.batch_count > 1) {
+
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes SYRK state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+    // Initialize the Params structure
+    params_ = typename Rank2Kkernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    int smem_size = int(sizeof(typename Rank2Kkernel::SharedStorage));
+    
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<Rank2Kkernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    
+    if (kSplitKSerial && args.batch_count > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(Rank2Kkernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename Rank2Kkernel::SharedStorage));
+
+    cutlass::Kernel<Rank2Kkernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Parital specialization for column-major output exchange operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by Rank2K update kernel
+    typename Operator_,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformB,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class Rank2K<ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           SplitKSerial, Operator_, TransformA, TransformB, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+  static int const kUpdateRank = 2;
+  
+  /// Define the kernel
+  using UnderlyingOperator = typename cutlass::gemm::device::Rank2K<
+    ElementB,
+    LayoutB,
+    ElementA,
+    LayoutA,
+    ElementC,
+    layout::RowMajor,
+    InvertFillMode<FillModeC>::mode,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentB,
+    kAlignmentA,
+    kSplitKSerial,
+    Operator,
+    kTransformA,
+    kTransformB,
+    kBlasMode
+  >;
+  
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using Rank2Kkernel = typename UnderlyingOperator::Rank2Kkernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the Rank2K.
+  Rank2K() { }
+
+  /// Helper to construct a transposed equivalent for the underying Rank2K operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem();
+  }
+
+  /// Determines whether the Rank2K can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes Rank2K state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace Rank2K
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/device/rank_k.h b/include/cutlass/gemm/device/rank_k.h
new file mode 100644
index 00000000..a85c6958
--- /dev/null
+++ b/include/cutlass/gemm/device/rank_k.h
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined RankK kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+
+#include "cutlass/gemm/kernel/default_rank_k_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ =
+        typename threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementA_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by SYRK
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementA_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class RankK {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static ComplexTransform const kTransformA = TransformA;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 1;
+
+  /// Define the kernel
+  using RankKkernel = typename kernel::DefaultRankKUniversal<
+    ElementA,
+    LayoutA,
+    kTransformA,
+    kAlignmentA,
+    ElementC,
+    LayoutC,
+    kFillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::RankKkernel;
+  
+  using Arguments = typename RankKkernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename RankKkernel::Params params_;
+public:
+
+  /// Constructs the SYRK.
+  RankK() { }
+
+  /// Determines whether the SYRK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = RankKkernel::can_implement(args);
+   
+    if (FillModeC != FillMode::kLower && FillModeC != FillMode::kUpper) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+    
+    if (kSplitKSerial && args.batch_count > 1) {
+
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes SYRK state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+    // Initialize the Params structure
+    params_ = typename RankKkernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
+    
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<RankKkernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    
+    if (kSplitKSerial && args.batch_count > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(RankKkernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename RankKkernel::SharedStorage));
+
+    cutlass::Kernel<RankKkernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/// Parital specialization for column-major output exchange operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by RankK update kernel
+    typename Operator_,
+    /// Complex elementwise transformation 
+    ComplexTransform TransformA,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class RankK<ElementA_, LayoutA_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           FillModeC, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA,
+           SplitKSerial, Operator_, TransformA, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static FillMode const kFillModeC = FillModeC;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  static int const kUpdateRank = 1;
+
+  // Complex transform for input A matrices (function on input layout)
+  static ComplexTransform const kTransformA = TransformA;
+  
+  /// Define the kernel
+  using UnderlyingOperator = typename cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    layout::RowMajor,
+    InvertFillMode<FillModeC>::mode,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kSplitKSerial,
+    Operator,
+    kTransformA,
+    kBlasMode
+  >;
+  
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using RankKkernel = typename UnderlyingOperator::RankKkernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the RankK.
+  RankK() { }
+
+  /// Helper to construct a transposed equivalent for the underying RankK operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args;
+  }
+
+  /// Determines whether the RankK can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes RankK state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace RankK
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/device/symm.h b/include/cutlass/gemm/device/symm.h
new file mode 100755
index 00000000..99bcd16a
--- /dev/null
+++ b/include/cutlass/gemm/device/symm.h
@@ -0,0 +1,602 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a pipelined SYMM and HEMM kernels. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+
+#include "cutlass/gemm/kernel/default_symm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
+      ElementC_,
+      128 / sizeof_bits<ElementC_>::value,
+      ElementAccumulator_,
+      ElementAccumulator_,
+      epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by SYMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+class Symm {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
+  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
+  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideModeA = SideModeA;
+  static FillMode const kFillModeA = FillModeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+
+  // static asserts for symm update kernel
+  static_assert(std::is_same<LayoutA, LayoutB>::value,
+    "SYMM update operator support same layouts for operand A and B");
+
+  /// Define the kernel
+  using SymmKernel = typename kernel::DefaultSymmUniversal<
+    ElementAKernel,
+    LayoutAKernel,
+    kSideModeA,
+    kFillModeA,
+    kAlignmentAKernel,
+    ElementBKernel,
+    LayoutBKernel,
+    kAlignmentBKernel,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >::SymmKernel;
+  
+  using Arguments = typename SymmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename SymmKernel::Params params_;
+public:
+
+  /// Constructs the SYMM.
+  Symm() { }
+
+  /// Determines whether the SYMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = SymmKernel::can_implement(args);
+
+    if (SideModeA == SideMode::kInvalid) {
+      return Status::kErrorInvalidProblem;
+    }
+   
+    if (FillModeA != FillMode::kLower && FillModeA != FillMode::kUpper) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+    
+    if (kSplitKSerial && args.batch_count > 1) {
+
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes SYMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+    
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+   // Swapping argument for A and B, if A was on the right side (problem size doesn't need to change here).
+    if (kSideModeA == SideMode::kRight) {
+      // Initialize the Params structure
+      params_ = typename SymmKernel::Params{
+        args.swapped_matrices(),
+        grid_tiled_shape,
+        gemm_k_size,
+        static_cast<int *>(workspace)
+      };
+
+      return Status::kSuccess;
+    }
+
+    // Initialize the Params structure
+    params_ = typename SymmKernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    
+    if (kSplitKSerial && args.batch_count > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(SymmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename SymmKernel::SharedStorage));
+
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<SymmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::Kernel<SymmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+
+/********************************************************************************************************
+  SYMM/HEMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
+  In templates and arguments to cutlass kernel, `matrix A` is always symmetric/hermitian, and `matrix B` is rectangular. 
+  (adhering to the cuBLAS convention)
+
+  Although, cuBLAS SYMM/HEMM only supports ColumnMajor layouts for all matrices (A, B, C/D).
+
+  For the mainloop and symm kernel, `A` and `B` points to left-side and right-side matrices, respectively.
+  
+  Thus, for LeftSide mode `A` and `B` points to `matrix A` and `matrix B`, respectively. While for 
+  the RightSide mode `A` and `B` points to `matrix B` and `matrix A`, respectively. 
+  
+  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
+  transposing the GEMM problem. Thus, ColumnMajor output layout for SYMM/HEMM requires:
+   - Transposing `matrix A` and `matrix B` layouts
+   - Swapping problem size m and n values
+   - Swapping LeftSide and RightSide mode
+  
+  RowMajor output:    D = matrix A x matrix B
+  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
+
+  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
+    1.  LeftSide mode and RowMajor output (default template)
+    2.  LeftSide mode and ColumnMajor output 
+    3.  RightSide mode and RowMajor output
+    4.  RightSide mode and ColumnMajor output
+  
+  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
+  
+  Case 2 -> Case 3:
+      D_col = matrix A x matrix B (LeftSide mode) 
+   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
+
+  swap pointers for `A` and `B` call GEMM mainloop with RowMajor efficient-epilogue
+
+  Case 4 -> Case 1:
+      D_col = matrix B x matrix A (RightSide mode) 
+   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
+
+   call GEMM mainloop for with RowMajor efficient-epilogue
+********************************************************************************************************/
+
+/// Parital specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for.  This is the minimum SM that
+    /// supports the intended feature. The device kernel can be built
+    /// targeting any SM larger than this number.
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial,
+    /// Operation performed by Symm update kernel
+    typename Operator_,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_
+    >
+class Symm<ElementA_, LayoutA_, SideModeA, FillModeA, ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB,
+           SplitKSerial, Operator_, BlasMode_> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideModeA = SideModeA;
+  static FillMode const kFillModeA = FillModeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  static BlasMode const kBlasMode = BlasMode_;
+  
+  /// Define the kernel
+  using UnderlyingOperator = typename cutlass::gemm::device::Symm<
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    InvertSideMode<kSideModeA>::mode,
+    InvertFillMode<kFillModeA>::mode,
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type, 
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kAlignmentB,
+    kSplitKSerial,
+    Operator,
+    kBlasMode
+  >;
+  
+
+  /// Argument structure
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using SymmKernel = typename UnderlyingOperator::SymmKernel;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the Symm.
+  Symm() { }
+
+  /// Helper to construct a transposed equivalent for the underying SYMM operator
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem_size();
+  }
+
+  /// Determines whether the Symm can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Computes the grid shape
+  static dim3 get_grid_shape(Arguments const &args) { 
+    return UnderlyingOperator::get_grid_shape(to_underlying_arguments(args));
+  }
+
+  /// Computes the maximum number of active blocks per multiprocessor
+  static int maximum_active_blocks(int smem_capacity = -1) {
+    return UnderlyingOperator::maximum_active_blocks(smem_capacity);
+  }
+
+  /// Initializes Symm state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace Symm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/device/trmm.h b/include/cutlass/gemm/device/trmm.h
new file mode 100644
index 00000000..28a18250
--- /dev/null
+++ b/include/cutlass/gemm/device/trmm.h
@@ -0,0 +1,758 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a TRMM kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/device_kernel.h"
+
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+
+#include "cutlass/gemm/kernel/default_trmm_universal.h"
+#include "cutlass/gemm/device/default_gemm_configuration.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*! Trmm device-level operator. This is an interface to efficient CUTLASS TRMM kernels that may
+  be invoked from host code.
+
+  The contributions of this class are:
+    
+    1. At compile time, it maps data types and high-level structural parameters onto 
+       specific CUTLASS components.
+
+    2. At runtime, it maps logical arguments to TRMM problems to kernel parameters.
+
+    3. At runtime, it launches kernels on the device.
+
+  The intent is to provide a convenient mechanism for interacting with most plausible TRMM
+  configurations for each supported architecture. Consequently, not all parameters are exposed
+  to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy
+  are selected to tradeoff simplicity of the interface with flexibility. We expect 
+  most configurations to be specified at this level. Applications with more exotic requirements 
+  may construct their kernels of interest using CUTLASS components at the threadblock, warp, 
+  and thread levels of abstraction.
+
+  CUTLASS exposes computations using the functor design pattern in which objects compose some
+  internal state with an overloaded function call operator. This enables decoupling of
+  initialization from execution, possibly reducing overhead during steady state phases of
+  application execution.
+
+  CUTLASS device-level operators expose an Arguments structure encompassing each logical
+  input to the computation. This is distinct from the kernel-level Params structure pattern
+  which contains application-specific precomputed state needed by the device code.
+
+  Example of a CUTLASS TRMM operator implementing the functionality of cuBLAS's STRMM NN
+  is as follows:
+
+    //
+    // Instantiate the CUTLASS TRMM operator.
+    //
+
+    cutlass::gemm::device::trmm<
+      float,
+      cutlass::layout::ColumnMajor,
+      cutlass::SideMode::kLeft,
+      cutlass::FillMode::kLower,
+      cutlass::DiagType::kNonUnit,
+      float,
+      cutlass::layout::ColumnMajor,
+      float,
+      cutlass::layout::ColumnMajor,
+    > trmm_op;
+
+    //
+    // Launch the TRMM operation on the device
+    //
+
+    cutlass::Status status = trmm_op({
+      cutlass::gemm::GemmUniversalMode,   // Trmm Problem Mode
+      {m, n, m/n},                        // GemmCoord problem_size (k is based on left- or right-side mode)
+      batch_count,
+      {alpha}                            // EpilogueOutputOp::Params epilogue_op_params
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int lda,
+      int ldb,
+      int ldc
+    });
+
+  A simplified view of the template is listed below.
+
+    template <
+      /// Element type for A matrix operand
+      typename ElementA,
+      
+      /// Layout type for A matrix operand
+      typename LayoutA,
+      
+      /// Side Mode for A (kLeft or kRight)
+      SideMode SideModeA,
+
+      /// Fill Mode for A (kLower or kUpper)
+      FillMode FillModeA,
+
+      /// DiagType for A (kNonUnit or kUnit)
+      DiagType DiagTypeA,
+
+      /// Element type for B matrix operand
+      typename ElementB,
+      
+      /// Layout type for B matrix operand
+      typename LayoutB,
+      
+      /// Element type for C and D matrix operands
+      typename ElementC,
+      
+      /// Layout type for C and D matrix operands
+      typename LayoutC,
+      
+      /// Element type for internal accumulation
+      typename ElementAccumulator,
+
+      /// Operator class tag
+      typename OperatorClass,
+      
+      /// Tag indicating architecture to tune for.  This is the minimum SM that
+      /// supports the intended feature. The device kernel can be built
+      /// targeting any SM larger than this number.
+      typename ArchTag,
+      
+      /// Threadblock-level tile size (concept: GemmShape)
+      typename ThreadblockShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename WarpShape,
+      
+      /// Warp-level tile size (concept: GemmShape)
+      typename InstructionShape,
+      
+      /// Epilogue output operator
+      typename EpilogueOutputOp,
+      
+      /// Threadblock-level swizzling operator
+      typename ThreadblockSwizzle,
+      
+      /// Number of stages used in the pipelined mainloop
+      int Stages,
+
+      /// Access granularity of A matrix in units of elements
+      int AlignmentA,
+
+      /// Access granularity of B matrix in units of elements
+      int AlignmentB,
+
+      /// If true, kernel supports split-K with serial reduction
+      bool SplitKSerial,
+
+      /// Operation performed by TRMM
+      typename Operator,
+
+      /// Complex elementwise transformation on A operand
+      ComplexTransform TransformA
+    >
+    class Trmm;
+*/
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A 
+    SideMode SideModeA,
+    /// Fill Mode for A
+    FillMode FillModeA,
+    /// DiagType for A
+    DiagType DiagTypeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_ = ElementC_,
+    /// Operator class tag
+    typename OperatorClass_ = arch::OpClassTensorOp,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_ = arch::Sm80,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_ = epilogue::thread::LinearCombination<
+      ElementC_,
+      128 / sizeof_bits<ElementC_>::value,
+      ElementAccumulator_,
+      ElementAccumulator_,
+      epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>,
+    /// Number of stages used in the pipelined mainloop
+    int Stages =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kStages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB =
+        DefaultGemmConfiguration<OperatorClass_, ArchTag_, ElementA_, ElementB_,
+                                 ElementC_, ElementAccumulator_>::kAlignmentB,
+    /// If true, kernel supports split-K with serial reduction
+    bool SplitKSerial = false,
+    /// Operation performed by TRMM
+    typename Operator_ = typename DefaultGemmConfiguration<
+        OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
+        ElementAccumulator_>::Operator,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA = ComplexTransform::kNone>
+class Trmm {
+ public:
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_;
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementB_, ElementA_>::type;
+  using LayoutAKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutB_, LayoutA_>::type;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), ElementA_, ElementB_>::type;
+  using LayoutBKernel = typename platform::conditional<(SideModeA == SideMode::kRight), LayoutA_, LayoutB_>::type;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideMode = SideModeA;
+  static FillMode const kFillMode = FillModeA;
+  static DiagType const kDiagType = DiagTypeA;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentAKernel = (SideModeA == SideMode::kRight) ? AlignmentB : AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static int const kAlignmentBKernel = (SideModeA == SideMode::kRight) ? AlignmentA : AlignmentB;
+  static int const kAlignmentC = EpilogueOutputOp::kCount;
+  static bool const kSplitKSerial = SplitKSerial;
+  // Complex Transform don't appply to B
+  static ComplexTransform const kTransformA = TransformA; 
+  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
+  static ComplexTransform const kTransformAKernel = (SideModeA == SideMode::kRight) ? 
+                                              ComplexTransform::kNone : TransformA;
+  static ComplexTransform const kTransformBKernel = (SideModeA == SideMode::kRight) ? 
+                                              TransformA : ComplexTransform::kNone;
+
+  /// Define the kernel
+  using TrmmKernel = typename kernel::DefaultTrmmUniversal<
+    ElementAKernel,
+    LayoutAKernel,
+    kTransformAKernel,
+    kAlignmentAKernel,
+    ElementBKernel,
+    LayoutBKernel,
+    kTransformBKernel,
+    kAlignmentBKernel,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kSplitKSerial,
+    Operator
+  >::TrmmKernel;
+  
+  using Arguments = typename TrmmKernel::Arguments;
+
+private:
+
+  /// Kernel parameters object
+  typename TrmmKernel::Params params_;
+public:
+
+  /// Constructs the TRMM.
+  Trmm() { }
+
+  /// Determines whether the TRMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    if (!kSplitKSerial && args.batch_count > 1) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    Status status = TrmmKernel::can_implement(args);
+   
+    if (SideModeA == SideMode::kInvalid) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (FillModeA == FillMode::kInvalid) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (DiagTypeA == DiagType::kInvalid) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Status::kSuccess;
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    size_t bytes = 0;
+
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+    
+    if (kSplitKSerial && args.batch_count > 1) {
+
+      bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n());
+    }
+
+    return bytes;
+  }
+
+  /// Initializes TRMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+ 
+    // Determine grid shape
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+      args.problem_size, 
+      {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+      args.batch_count);
+
+    if (kSplitKSerial) {
+      if (args.batch_count > 1) {
+        if (!workspace) {
+          return Status::kErrorWorkspaceNull;
+        }
+
+        size_t bytes = get_workspace_size(args);
+      
+        cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+        if (result != cudaSuccess) {
+          return Status::kErrorInternal;
+        }
+      }
+    }
+    else {
+
+      if (args.batch_count > 1) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+    
+    int gemm_k_size = args.problem_size.k();
+
+   // Swapping argument for A and B, if A was on the right side (problem size doesn't need to change here).
+    if (kSideMode == SideMode::kRight) {
+      // Initialize the Params structure
+      params_ = typename TrmmKernel::Params{
+        args.swapped_matrices(),
+        grid_tiled_shape,
+        gemm_k_size,
+        static_cast<int *>(workspace)
+      };
+
+      return Status::kSuccess;
+    }
+
+    // Initialize the Params structure
+    params_ = typename TrmmKernel::Params{
+      args,
+      grid_tiled_shape,
+      gemm_k_size,
+      static_cast<int *>(workspace)
+    };
+    
+    return Status::kSuccess;
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+    
+    if (kSplitKSerial && args.batch_count > 1) {  
+      if (!workspace) {
+        return Status::kErrorWorkspaceNull;
+      }
+    }
+
+    size_t workspace_bytes = get_workspace_size(args);
+
+    if (workspace_bytes && !workspace) {
+      return Status::kErrorWorkspaceNull;
+    }
+
+    params_.update(args, workspace);
+
+    return Status::kSuccess;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    ThreadblockSwizzle threadblock_swizzle;
+
+    dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+    dim3 block(TrmmKernel::kThreadCount, 1, 1);
+
+    int smem_size = int(sizeof(typename TrmmKernel::SharedStorage));
+    
+    if (smem_size >= (48 << 10)) {
+      cudaError_t result = cudaFuncSetAttribute(Kernel<TrmmKernel>,
+                                    cudaFuncAttributeMaxDynamicSharedMemorySize,
+                                    smem_size);
+
+      if (result != cudaSuccess) {
+        return Status::kErrorInternal;
+      }
+    }
+
+    cutlass::Kernel<TrmmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+    cudaError_t result = cudaGetLastError();
+
+    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+    
+    Status status = initialize(args, workspace);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+/********************************************************************************************************
+  TRMM has 4 combinations based on Layouts {RowMajor, ColumnMajor} x Side mode {LeftSide, RightSide}
+  In templates and arguments to cutlass kernel, `matrix A` is always triangular, and `matrix B` is rectangular. 
+  (adhering to the cuBLAS convention)
+
+For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side matrices, respectively.
+  
+  Thus, for LeftSide mode `A` and `B` points to `matrix A` and `matrix B`, respectively. While for 
+  the RightSide mode `A` and `B` points to `matrix B` and `matrix A`, respectively. 
+  
+  Additionally, CUTLASS GEMM epilogue is always RowMajor, and ColumnMajor output is achieved by 
+  transposing the GEMM problem. Thus, ColumnMajor output layout for TRMM requires:
+   - Transposing `matrix A` and `matrix B` layouts
+   - Swapping problem size m and n values
+   - Swapping LeftSide and RightSide mode
+  
+  RowMajor output:    D = matrix A x matrix B
+  ColumnMajor output: D = matrix A x matrix B -> Transpose (D) = Transpose(matrix B) x Transpose(matrix A)
+
+  {RowMajor, ColumnMajor} x Side Mode {LeftSide, RightSide} 4 cases:
+    1.  LeftSide mode and RowMajor output (default template)
+    2.  LeftSide mode and ColumnMajor output 
+    3.  RightSide mode and RowMajor output
+    4.  RightSide mode and ColumnMajor output
+  
+  Mapping ColumnMajor output layout cases 2 and 4 to RowMajor efficient epilogue implementation:
+  
+  Case 2 -> Case 3:
+      D_col = matrix A x matrix B (LeftSide mode) 
+   => Transpose(D_col) = Transpose(matrix B) x Transpose(matrix A) (RightSide mode)
+
+  swap pointers for `A` and `B` call GEMM mainloop with RowMajor efficient-epilogue
+
+  Case 4 -> Case 1:
+      D_col = matrix B x matrix A (RightSide mode) 
+   => Transpose(D_col) = Transpose(matrix A) x Transpose(matrix B) (LeftSide mode)
+
+   call GEMM mainloop for with RowMajor efficient-epilogue
+********************************************************************************************************/
+
+/// Parital specialization for column-major output exchanges problem size and operand.
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A 
+    SideMode SideModeA,
+    /// Fill Mode for A
+    FillMode FillModeA,
+    /// DiagType for A
+    DiagType DiagTypeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Epilogue output operator
+    typename EpilogueOutputOp_,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Access granularity of A matrix in units of elements
+    int AlignmentA,
+    /// Access granularity of B matrix in units of elements
+    int AlignmentB,
+    /// If true, kernel supports split-K as a serial reduction
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA>
+class Trmm<ElementA_, LayoutA_, SideModeA, FillModeA, DiagTypeA,
+           ElementB_, LayoutB_, ElementC_,
+           layout::ColumnMajor,  // partially specialized on LayoutC
+           ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_,
+           WarpShape_, InstructionShape_, EpilogueOutputOp_,
+           ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial,
+           Operator_, TransformA> {
+ public:
+
+  using ElementA = ElementA_;
+  using LayoutA = LayoutA_; 
+  using TensorRefA = TensorRef<ElementA const, LayoutA>;
+  using ElementB = ElementB_;
+  using LayoutB = LayoutB_;
+  using TensorRefB = TensorRef<ElementB const, LayoutB>;
+  using ElementC = ElementC_;
+  using LayoutC = layout::ColumnMajor;
+  using TensorRefC = TensorRef<ElementC const, LayoutC>;
+  using TensorRefD = TensorRef<ElementC, LayoutC>;
+  using ElementAccumulator = ElementAccumulator_;
+  using OperatorClass = OperatorClass_;
+  using ArchTag = ArchTag_;
+  using ThreadblockShape = ThreadblockShape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = InstructionShape_;
+  using EpilogueOutputOp = EpilogueOutputOp_;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  using Operator = Operator_;
+  static SideMode const kSideMode = SideModeA;
+  static FillMode const kFillMode = FillModeA;
+  static DiagType const kDiagType = DiagTypeA;
+  // Changing SideMode as we change the layout
+  static SideMode const kSideModeT = (SideModeA == SideMode::kLeft) ?
+                                      SideMode::kRight : SideMode::kLeft;
+  // Changing FillMode as we change the layout
+  static FillMode const kFillModeT = (FillModeA == FillMode::kLower) ? 
+                                      FillMode::kUpper : FillMode::kLower;
+  static int const kStages = Stages;
+  static int const kAlignmentA = AlignmentA;
+  static int const kAlignmentB = AlignmentB;
+  static ComplexTransform const kTransformA = TransformA;
+  // Complex Transform don't appply to B
+  static ComplexTransform const kTransformB = ComplexTransform::kNone; 
+  static bool const kSplitKSerial = SplitKSerial;
+
+  using UnderlyingOperator = Trmm<
+    ElementA,
+    typename layout::LayoutTranspose<LayoutA>::type,
+    kSideModeT,
+    kFillModeT,
+    kDiagType,
+    ElementB,
+    typename layout::LayoutTranspose<LayoutB>::type, 
+    ElementC,
+    layout::RowMajor,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    kStages,
+    kAlignmentA,
+    kAlignmentB,
+    kSplitKSerial,
+    Operator,
+    TransformA
+  >;
+
+  using Arguments = typename UnderlyingOperator::Arguments;
+  using TrmmKernel = typename UnderlyingOperator::TrmmKernel;
+  static int const kAlignmentC = UnderlyingOperator::kAlignmentC;
+
+private:
+
+  UnderlyingOperator underlying_operator_;
+
+public:
+
+  /// Constructs the TRMM.
+  Trmm() { }
+
+  /// Helper to construct a transposed equivalent for the underying TRMM operator which is identical
+  static Arguments to_underlying_arguments(Arguments const &args) {
+    return args.transposed_problem_size();
+  }
+
+  /// Determines whether the TRMM can execute the given problem.
+  static Status can_implement(Arguments const &args) {
+
+    return UnderlyingOperator::can_implement(to_underlying_arguments(args));
+  }
+
+  /// Gets the workspace size
+  static size_t get_workspace_size(Arguments const &args) {
+    
+    return UnderlyingOperator::get_workspace_size(to_underlying_arguments(args));
+  }
+
+  /// Initializes TRMM state from arguments.
+  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.initialize(to_underlying_arguments(args), workspace, stream);
+  }
+
+  /// Lightweight update given a subset of arguments
+  Status update(Arguments const &args, void *workspace = nullptr) {
+
+    return underlying_operator_.update(to_underlying_arguments(args), workspace);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status run(cudaStream_t stream = nullptr) {
+
+    return underlying_operator_.run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(cudaStream_t stream = nullptr) {
+    return run(stream);
+  }
+
+  /// Runs the kernel using initialized state.
+  Status operator()(
+    Arguments const &args, 
+    void *workspace = nullptr, 
+    cudaStream_t stream = nullptr) {
+   
+    Status status = initialize(args, workspace, stream);
+    
+    if (status == Status::kSuccess) {
+      status = run(stream);
+    }
+
+    return status;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h
index 4898b0bf..e2db4c95 100644
--- a/include/cutlass/gemm/gemm.h
+++ b/include/cutlass/gemm/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -66,7 +72,6 @@ struct GemmShape {
 
   static int const kCount = kMNK;
 
-
   //
   // Static member functions
   //
diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h
index ea1e9dee..3b13bf5a 100644
--- a/include/cutlass/gemm/kernel/default_gemm.h
+++ b/include/cutlass/gemm/kernel/default_gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -63,7 +69,6 @@
 #include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
 #endif //CUTLASS_ARCH_WMMA_ENABLED
 
-
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace cutlass {
@@ -113,7 +118,15 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
+    ///
+    typename Enable = void
 >
 struct DefaultGemm;
 
@@ -158,23 +171,30 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear>
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
+>
 struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementC,
                    LayoutC, ElementAccumulator, arch::OpClassTensorOp,
                    arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
-                   Operator, SharedMemoryClear> {
+                   Operator, SharedMemoryClear, GatherA, GatherB, ScatterD> {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
+             "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
   using Mma = typename cutlass::gemm::threadblock::DefaultMma<
       ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
       ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80,
       ThreadblockShape, WarpShape, InstructionShape, Stages,
-      Operator, false, SharedMemoryClear>::ThreadblockMma;
+      Operator, false, SharedMemoryClear, GatherA, GatherB>::ThreadblockMma;
 
   static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
 
@@ -182,7 +202,7 @@ struct DefaultGemm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignment
   using RegularEpilogue =
       typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
           ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
-          EpilogueOutputOp::kCount>::Epilogue;
+          EpilogueOutputOp::kCount, ScatterD>::Epilogue;
 
   using Affine2Epilogue =
       typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpAffineRankN<
@@ -232,7 +252,13 @@ template <
   /// Operation performed by GEMM
   typename Operator,
   /// Use zfill or predicate for out-of-bound cp.async
-  SharedMemoryClearOption SharedMemoryClear
+  SharedMemoryClearOption SharedMemoryClear,
+  /// Gather operand A by using an index array
+  bool GatherA,
+  /// Gather operand B by using an index array
+  bool GatherB,
+  /// Scatter result D by using an index array
+  bool ScatterD
 >
 struct DefaultGemm<
   ElementA, LayoutA, kAlignmentA,
@@ -249,7 +275,10 @@ struct DefaultGemm<
   2,
   SplitKSerial,
   Operator,
-  SharedMemoryClear
+  SharedMemoryClear,
+  GatherA,
+  GatherB,
+  ScatterD
 > {
 
   /// Define the threadblock-scoped matrix multiply-accumulate
@@ -268,7 +297,11 @@ struct DefaultGemm<
     WarpShape,
     InstructionShape,
     2,
-    Operator
+    Operator,
+    false,
+    SharedMemoryClear,
+    GatherA,
+    GatherB
   >::ThreadblockMma;
 
   static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
@@ -279,7 +312,8 @@ struct DefaultGemm<
     typename Mma::Operator,
     kPartitionsK,
     EpilogueOutputOp,
-    EpilogueOutputOp::kCount
+    EpilogueOutputOp::kCount,
+    ScatterD
   >::Epilogue;
 
   /// Define the kernel-level GEMM operator.
@@ -327,7 +361,7 @@ struct DefaultGemm<
     layout::ColumnMajorInterleaved<InterleavedK>, int32_t,
     arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape,
     InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages,
-    SplitKSerial, Operator, SharedMemoryClear> {
+    SplitKSerial, Operator, SharedMemoryClear, false, false, false> {
 
   using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
   using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
@@ -393,7 +427,8 @@ struct DefaultGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
                    ElementC, layout::ColumnMajorInterleaved<InterleavedK>,
                    int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape,
                    WarpShape, InstructionShape, EpilogueOutputOp,
-                   ThreadblockSwizzle, 2, SplitKSerial, Operator, SharedMemoryClear> {
+                   ThreadblockSwizzle, 2, SplitKSerial, Operator, SharedMemoryClear,
+                   false, false, false> {
 
   using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
   using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
@@ -452,7 +487,13 @@ template <
   /// Operation performed by GEMM
   typename Operator,
   /// Use zfill or predicate for out-of-bound cp.async
-  SharedMemoryClearOption SharedMemoryClear
+  SharedMemoryClearOption SharedMemoryClear,
+  /// Gather operand A by using an index array
+  bool GatherA,
+  /// Gather operand B by using an index array
+  bool GatherB,
+  /// Scatter result D by using an index array
+  bool ScatterD
 >
 struct DefaultGemm<
   ElementA, LayoutA, kAlignmentA,
@@ -469,7 +510,10 @@ struct DefaultGemm<
   2,
   SplitKSerial,
   Operator,
-  SharedMemoryClear
+  SharedMemoryClear,
+  GatherA,
+  GatherB,
+  ScatterD
 > {
 
   /// Define the threadblock-scoped matrix multiply-accumulate
@@ -488,7 +532,11 @@ struct DefaultGemm<
     WarpShape,
     GemmShape<8, 8, 4>,
     2,
-    Operator
+    Operator,
+    false,
+    SharedMemoryClear,
+    GatherA,
+    GatherB
   >::ThreadblockMma;
 
   static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
@@ -499,7 +547,8 @@ struct DefaultGemm<
     typename Mma::Operator,
     kPartitionsK,
     EpilogueOutputOp,
-    EpilogueOutputOp::kCount
+    EpilogueOutputOp::kCount,
+    ScatterD
   >::Epilogue;
 
   /// Define the kernel-level GEMM operator.
@@ -543,7 +592,13 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
   >
 struct DefaultGemm<
     ElementA,
@@ -565,11 +620,15 @@ struct DefaultGemm<
     2,
     SplitKSerial,
     Operator,
-    SharedMemoryClear> {
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    ScatterD,
+    typename platform::enable_if< ! platform::is_same<ArchTag, arch::Sm80>::value >::type > {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
+             "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
   using Mma = typename cutlass::gemm::threadblock::DefaultMma<
@@ -587,7 +646,11 @@ struct DefaultGemm<
       WarpShape,
       GemmShape<1, 1, 1>,
       2,
-      Operator>::ThreadblockMma;
+      Operator,
+      false,
+      SharedMemoryClear,
+      GatherA,
+      GatherB>::ThreadblockMma;
 
   static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
   static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
@@ -597,7 +660,8 @@ struct DefaultGemm<
       ThreadblockShape,
       typename Mma::Operator,
       EpilogueOutputOp,
-      kEpilogueElementsPerAccess
+      kEpilogueElementsPerAccess,
+      ScatterD
       >::Epilogue;
 
   using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
@@ -653,7 +717,13 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
 >
 struct DefaultGemm<ElementA,
                    LayoutA,
@@ -674,18 +744,21 @@ struct DefaultGemm<ElementA,
                    Stages,
                    SplitKSerial,
                    Operator,
-                   SharedMemoryClear> {
+                   SharedMemoryClear,
+                   GatherA,
+                   GatherB,
+                   ScatterD> {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
-             "simt epilogue must be row major");
+             "Epilogue in the kernel level must be row major");
 
   /// Define the threadblock-scoped matrix multiply-accumulate
   using Mma = typename cutlass::gemm::threadblock::DefaultMma<
       ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
       ElementAccumulator, LayoutC, arch::OpClassSimt, arch::Sm80,
       ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages,
-      Operator, false, SharedMemoryClear>::ThreadblockMma;
+      Operator, false, SharedMemoryClear, GatherA, GatherB>::ThreadblockMma;
 
   static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount;
   static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars");
@@ -695,7 +768,8 @@ struct DefaultGemm<ElementA,
       ThreadblockShape,
       typename Mma::Operator,
       EpilogueOutputOp,
-      kEpilogueElementsPerAccess
+      kEpilogueElementsPerAccess,
+      ScatterD
       >::Epilogue;
 
   using Affine2Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimtAffineRankN<
@@ -754,7 +828,7 @@ struct DefaultGemm<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
                    ElementC, LayoutC, ElementAccumulator, arch::OpClassSimt,
                    ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>,
                    EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial,
-                   Operator, SharedMemoryClear> {
+                   Operator, SharedMemoryClear, false, false, false> {
   using InstructionShape = GemmShape<1, 1, 4>;
   using ElementA = int8_t;
   using ElementB = int8_t;
@@ -850,7 +924,11 @@ struct DefaultGemm<
   Stages, 
   SplitKSerial,
   Operator,
-  SharedMemoryClear> {
+  SharedMemoryClear,
+  false,
+  false,
+  false
+> {
   /// Define the threadblock-scoped matrix multiply-accumulate
   using Mma = typename cutlass::gemm::threadblock::DefaultMma<
       ElementA, LayoutA, kAlignmentA,
diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h
index 04cc2579..1fad1412 100644
--- a/include/cutlass/gemm/kernel/default_gemm_complex.h
+++ b/include/cutlass/gemm/kernel/default_gemm_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_grouped.h b/include/cutlass/gemm/kernel/default_gemm_grouped.h
index 664583f9..d6cdd1dd 100644
--- a/include/cutlass/gemm/kernel/default_gemm_grouped.h
+++ b/include/cutlass/gemm/kernel/default_gemm_grouped.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
index 2abd7c41..69bd9077 100644
--- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
+++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -171,7 +177,7 @@ struct DefaultGemmPlanarComplexUniversal<
   ThreadblockSwizzle,
   Stages,
   Operator,
-  typename std::enable_if<(Stages <= 2)>::type 
+  typename platform::enable_if<(Stages <= 2)>::type 
 > {
 
   /// Define planar complex valued variants instead
@@ -287,7 +293,7 @@ struct DefaultGemmPlanarComplexUniversal<
   ThreadblockSwizzle,
   Stages,
   Operator,
-  typename std::enable_if<(Stages > 2)>::type 
+  typename platform::enable_if<(Stages > 2)>::type 
 > {
 
   /// Define planar complex valued variants instead
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse.h b/include/cutlass/gemm/kernel/default_gemm_sparse.h
index 115d0572..aa0e8ed3 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
index 3e68acc6..375e2767 100644
--- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h
index 7b799f3d..45b2c769 100644
--- a/include/cutlass/gemm/kernel/default_gemm_universal.h
+++ b/include/cutlass/gemm/kernel/default_gemm_universal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -97,6 +103,12 @@ template <
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
     SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false,
+    /// Scatter result D by using an index array
+    bool ScatterD = false,
     ///
     typename Enable = void
     >
@@ -145,7 +157,13 @@ template <
     /// Operation performed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB,
+    /// Scatter result D by using an index array
+    bool ScatterD
 >
 struct DefaultGemmUniversal<
   ElementA,
@@ -169,7 +187,10 @@ struct DefaultGemmUniversal<
   Stages,
   Operator,
   SharedMemoryClear,
-  typename std::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+  GatherA,
+  GatherB,
+  ScatterD,
+  typename platform::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
 > {
 
   using DefaultGemmKernel = typename kernel::DefaultGemm<
@@ -192,7 +213,10 @@ struct DefaultGemmUniversal<
     Stages,
     true,
     Operator,
-    SharedMemoryClear
+    SharedMemoryClear,
+    GatherA,
+    GatherB,
+    ScatterD
   >::GemmKernel;
 
     /// Define the kernel in terms of the default kernel
@@ -275,7 +299,10 @@ struct DefaultGemmUniversal<
   Stages,
   Operator,
   SharedMemoryClear,
-  typename std::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+  false,
+  false,
+  false,
+  typename platform::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
 > {
 
   using DefaultGemmKernel = typename kernel::DefaultGemmComplex<
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
index bd90b3ae..8cf0f7bb 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
index e3d8c5a4..ddb78b2c 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
index d6693ed6..a808a7e0 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h
index 48c24e6e..b9d4bb1a 100755
--- a/include/cutlass/gemm/kernel/default_gemv.h
+++ b/include/cutlass/gemm/kernel/default_gemv.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/default_rank_2k.h b/include/cutlass/gemm/kernel/default_rank_2k.h
new file mode 100644
index 00000000..d421d85d
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_2k.h
@@ -0,0 +1,207 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank2K definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRank2K;
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRank2K<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x BT)
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      kAlignmentB,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x AT)
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementB, LayoutB, 
+      kAlignmentB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, BlasMode::kSymmetric>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_rank_2k_complex.h b/include/cutlass/gemm/kernel/default_rank_2k_complex.h
new file mode 100644
index 00000000..8255cef2
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_2k_complex.h
@@ -0,0 +1,334 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank2K definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRank2KComplex;
+
+
+////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+
+template <
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Complex elementwise transformation 
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation 
+  ComplexTransform TransformB,
+  /// Blas3 computation mode (symmetric/hermitian)
+  BlasMode BlasMode_
+  > struct Rank2KTransposedComplexTransform {
+  
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+};
+  
+  // partial specializations for HER2K CUBLAS_OP_N layout (ColumMajor)
+template <>
+  struct Rank2KTransposedComplexTransform <
+  layout::ColumnMajor, layout::ColumnMajor, 
+  ComplexTransform::kNone, ComplexTransform::kNone,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+};
+
+  // partial specializations for HER2K CUBLAS_OP_C layout (RowMajor + Complex conjugate) 
+template <>
+  struct Rank2KTransposedComplexTransform <
+  layout::RowMajor, layout::RowMajor, 
+  ComplexTransform::kConjugate, ComplexTransform::kConjugate,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
+
+};
+
+}
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (symmetric)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^T)
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (hermitian)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRank2KComplex<
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, TransformB, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transform for input A and B matrices (function on input layout)
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformB;
+
+  using TransposedComplexTransform = detail::Rank2KTransposedComplexTransform<
+                                        LayoutA, LayoutB, 
+                                        TransformA, TransformB,
+                                        kBlasMode>;
+
+  // Complex transform on operandA and operandB (function of blas3 computation)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^H)
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementB, typename layout::LayoutTranspose<LayoutB>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (B x A^H)
+  using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementB, LayoutB, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level Rank2K operator.
+  using Rank2Kkernel = kernel::Rank2KUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, FillModeC, kBlasMode>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_rank_2k_universal.h b/include/cutlass/gemm/kernel/default_rank_2k_universal.h
new file mode 100644
index 00000000..180e6579
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_2k_universal.h
@@ -0,0 +1,346 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank 2k  definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/rank_2k_universal.h"
+#include "cutlass/gemm/kernel/default_rank_2k.h"
+#include "cutlass/gemm/kernel/default_rank_2k_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    ///
+    typename Enable = void
+    >
+struct DefaultRank2KUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued Rank 2k update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by Rank2k
+    typename Operator>
+struct DefaultRank2KUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename std::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2K<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::Rank2Kkernel;
+
+    /// Define the kernel in terms of the default kernel
+  using Rank2Kkernel = kernel::Rank2KUniversal<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC,
+    BlasMode::kSymmetric
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued Rank 2K update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    // BlasMode
+    BlasMode kBlasMode
+  >
+
+struct DefaultRank2KUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,   
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,  
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename std::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRank2Kkernel = typename kernel::DefaultRank2KComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::Rank2Kkernel;
+
+    /// Define the kernel in terms of the default kernel
+  using Rank2Kkernel = kernel::Rank2KUniversal<
+    typename DefaultRank2Kkernel::Mma1,
+    typename DefaultRank2Kkernel::Mma2,
+    typename DefaultRank2Kkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC,
+    kBlasMode
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/default_rank_k.h b/include/cutlass/gemm/kernel/default_rank_k.h
new file mode 100644
index 00000000..80960f2a
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_k.h
@@ -0,0 +1,185 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRankK;
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultRankK<
+                    ElementA, LayoutA, kAlignmentA, 
+                    ElementC,layout::RowMajor, FillModeC, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x AT)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMma<
+      ElementA, LayoutA, 
+      kAlignmentA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      kAlignmentA,
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+  
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, BlasMode::kSymmetric>::Epilogue;
+
+  /// Define the kernel-level Rank2 operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_rank_k_complex.h b/include/cutlass/gemm/kernel/default_rank_k_complex.h
new file mode 100644
index 00000000..21d8607b
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_k_complex.h
@@ -0,0 +1,295 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level RankK definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultRankKComplex;
+
+
+////////////////////////////////////////////////////////////////////////////////
+namespace detail {
+
+template <
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Complex elementwise transformation 
+  ComplexTransform TransformA,
+  /// Blas3 computation mode (symmetric/hermitian)
+  BlasMode BlasMode_
+  > struct RankKTransposedComplexTransform {
+  
+  static ComplexTransform const kTransformA = TransformA;
+  static ComplexTransform const kTransformB = TransformA;
+
+};
+  
+  // partial specializations for HERK CUBLAS_OP_N layout (ColumMajor)
+template <>
+  struct RankKTransposedComplexTransform <
+  layout::ColumnMajor, 
+  ComplexTransform::kNone,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kConjugate;
+  static ComplexTransform const kTransformB = ComplexTransform::kNone;
+
+};
+
+  // partial specializations for HERK CUBLAS_OP_C layout (RowMajor + Complex conjugate) 
+template <>
+  struct RankKTransposedComplexTransform <
+  layout::RowMajor, 
+  ComplexTransform::kConjugate,
+  BlasMode::kHermitian> {
+
+  static ComplexTransform const kTransformA = ComplexTransform::kNone;
+  static ComplexTransform const kTransformB = ComplexTransform::kConjugate;
+
+};
+
+}
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (symmetric)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x B^T)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      TransformA, TransformA, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (hermitian)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultRankKComplex<
+  ElementA, LayoutA, ElementC, 
+  layout::RowMajor, FillModeC, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  TransformA, Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+  // Complex transform for input A and B matrices (function on input layout)
+  static ComplexTransform const kTransformA = TransformA;
+
+  using TransposedComplexTransform = detail::RankKTransposedComplexTransform<
+                                        LayoutA, 
+                                        TransformA,
+                                        kBlasMode>;
+
+  // Complex transform on operandA and operandB (function of blas3 computation)
+  static ComplexTransform const kTransformOperandA = TransposedComplexTransform::kTransformA;
+  static ComplexTransform const kTransformOperandB = TransposedComplexTransform::kTransformB;
+
+  /// Define the threadblock-scoped matrix multiply-accumulate (A x A^H)
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex<
+      ElementA, LayoutA, 
+      ElementA, typename layout::LayoutTranspose<LayoutA>::type, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, 
+      ThreadblockShape, WarpShape, InstructionShape, Stages, 
+      kTransformOperandA, kTransformOperandB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOpBlas3<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator, kBlasMode>::Epilogue;
+
+  /// Define the kernel-level RankK operator.
+  using RankKkernel = kernel::RankKUniversal<Mma, Epilogue, ThreadblockSwizzle, FillModeC>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_rank_k_universal.h b/include/cutlass/gemm/kernel/default_rank_k_universal.h
new file mode 100644
index 00000000..01181984
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_rank_k_universal.h
@@ -0,0 +1,305 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level Rank k  definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/rank_k_universal.h"
+#include "cutlass/gemm/kernel/default_rank_k.h"
+#include "cutlass/gemm/kernel/default_rank_k_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    ///
+    typename Enable = void
+    >
+struct DefaultRankKUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued Rank k update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by Rank2k
+    typename Operator>
+struct DefaultRankKUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename std::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRankKkernel = typename kernel::DefaultRankK<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::RankKkernel;
+
+    /// Define the kernel in terms of the default kernel
+  using RankKkernel = kernel::RankKUniversal<
+    typename DefaultRankKkernel::Mma,
+    typename DefaultRankKkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued Rank 2K update kernels
+//
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Fill Mode for C (kLower or kUpper)
+    FillMode FillModeC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    // BlasMode
+    BlasMode kBlasMode
+  >
+
+struct DefaultRankKUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,   
+  kAlignmentA,
+  ElementC,
+  LayoutC,
+  FillModeC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename std::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultRankKkernel = typename kernel::DefaultRankKComplex<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    FillModeC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::RankKkernel;
+
+    /// Define the kernel in terms of the default kernel
+  using RankKkernel = kernel::RankKUniversal<
+    typename DefaultRankKkernel::Mma,
+    typename DefaultRankKkernel::Epilogue, 
+    ThreadblockSwizzle,
+    FillModeC
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/default_symm.h b/include/cutlass/gemm/kernel/default_symm.h
new file mode 100755
index 00000000..079bb09b
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_symm.h
@@ -0,0 +1,227 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_trmm.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultSymm;
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultSymm<
+                    ElementA, LayoutA, kSideModeA, kFillModeA, kAlignmentA, 
+                    ElementB, LayoutB, kAlignmentB, 
+                    ElementC,layout::RowMajor, 
+                    ElementAccumulator, arch::OpClassTensorOp, arch::Sm80, 
+                    ThreadblockShape, WarpShape, InstructionShape,
+                    EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                    Operator> {
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, 
+      ElementB, LayoutB, kAlignmentB,
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate 
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultTrmm<
+			ElementA, LayoutAMma2, kAlignmentA, 
+			ElementB, LayoutBMma2, kAlignmentB,
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma1::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level SYMM/HEMM operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+};
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_symm_complex.h b/include/cutlass/gemm/kernel/default_symm_complex.h
new file mode 100755
index 00000000..a8b5d218
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_symm_complex.h
@@ -0,0 +1,316 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kSymmetric>
+struct DefaultSymmComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (symmetric)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kSymmetric> {
+
+  static BlasMode const kBlasMode = BlasMode::kSymmetric;
+  // Complex Transform don't appply to A or B for SYMM
+  static ComplexTransform const TransformA = ComplexTransform::kNone; 
+  static ComplexTransform const TransformB = ComplexTransform::kNone; 
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal: alpha * AT * B or alpha * B * AT
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type; 
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture complex datatype (hermitian)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode kSideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode kFillModeA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation performed by GEMM
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial>
+struct DefaultSymmComplex<
+  ElementA, LayoutA, kSideModeA, kFillModeA, ElementB, LayoutB, ElementC, 
+  layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, 
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, 
+  Operator, SplitKSerial, BlasMode::kHermitian> {
+
+  static BlasMode const kBlasMode = BlasMode::kHermitian;
+
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  /// TRMM - with diagonal: alpha * A * B or alpha * B * A
+	static const DiagType kDiagTypeMma1 = DiagType::kNonUnit;
+  static ComplexTransform const TransformAMma1 = ComplexTransform::kNone; 
+  static ComplexTransform const TransformBMma1 = ComplexTransform::kNone; 
+  using Mma1 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, 
+      ElementB, LayoutB, 
+      kSideModeA, kFillModeA, kDiagTypeMma1, 
+      ElementAccumulator, layout::RowMajor, 
+      arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape,
+      Stages, TransformAMma1, TransformBMma1, Operator, BlasMode::kHermitian>::ThreadblockMma;
+
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  /// TRMM - withOUT diagonal - with conjugate transpose: alpha * AT * B or alpha * B * AT
+	static const DiagType kDiagTypeMma2 = DiagType::kZero;
+  using LayoutAMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                typename layout::LayoutTranspose<LayoutA>::type, 
+                                LayoutA
+                              >::type;
+  using LayoutBMma2 = typename platform::conditional<
+                                (kSideModeA == SideMode::kLeft), 
+                                LayoutB, 
+                                typename layout::LayoutTranspose<LayoutB>::type
+                              >::type;
+  static ComplexTransform const TransformAMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kConjugate : ComplexTransform::kNone;
+  static ComplexTransform const TransformBMma2 = (kSideModeA == SideMode::kLeft) ? 
+                                              ComplexTransform::kNone : ComplexTransform::kConjugate;
+
+	using Mma2 = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+			ElementA, LayoutAMma2, 
+			ElementB, LayoutBMma2, 
+			kSideModeA, InvertFillMode<kFillModeA>::mode, kDiagTypeMma2, 
+			ElementAccumulator, layout::RowMajor, 
+			arch::OpClassTensorOp, arch::Sm80,
+			ThreadblockShape, WarpShape, InstructionShape,
+			Stages, TransformAMma2, TransformBMma2, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma1::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level Symm operator.
+  using SymmKernel = kernel::SymmUniversal<Mma1, Mma2, Epilogue, ThreadblockSwizzle, kSideModeA, kFillModeA>;
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_symm_universal.h b/include/cutlass/gemm/kernel/default_symm_universal.h
new file mode 100755
index 00000000..f3bd5552
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_symm_universal.h
@@ -0,0 +1,342 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level SYMM/HEMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/symm_universal.h"
+#include "cutlass/gemm/kernel/default_symm.h"
+#include "cutlass/gemm/kernel/default_symm_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    /// Blas3 computation mode (symmetric/hermitian)
+    BlasMode BlasMode_ = BlasMode::kSymmetric,
+    ///
+    typename Enable = void
+    >
+struct DefaultSymmUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued SYMM/HEMM update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYMM/HEMM
+    typename Operator>
+struct DefaultSymmUniversal<
+  ElementA,
+  LayoutA,
+  SideModeA,
+  FillModeA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  BlasMode::kSymmetric,
+  typename std::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultSymmkernel = typename kernel::DefaultSymm<
+    ElementA,
+    LayoutA,
+    SideModeA,
+    FillModeA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator,
+    BlasMode::kSymmetric
+  >::SymmKernel;
+
+    /// Define the kernel in terms of the default kernel
+  using SymmKernel = kernel::SymmUniversal<
+    typename DefaultSymmkernel::Mma1,
+    typename DefaultSymmkernel::Mma2,
+    typename DefaultSymmkernel::Epilogue, 
+    ThreadblockSwizzle,
+    SideModeA,
+    FillModeA
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued SYMM/HEMM update kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Side Mode for A (kLeft or kRight)
+    SideMode SideModeA,
+    /// Fill Mode for A (kLower or kUpper)
+    FillMode FillModeA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by SYRK
+    typename Operator,
+    // BlasMode
+    BlasMode kBlasMode
+  >
+
+struct DefaultSymmUniversal<
+  ElementA,
+  LayoutA,
+  SideModeA,
+  FillModeA, 
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  kAlignmentB,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  kBlasMode,
+  typename std::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultSymmkernel = typename kernel::DefaultSymmComplex<
+    ElementA,
+    LayoutA,
+    SideModeA,
+    FillModeA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    Operator,
+    SplitKSerial,
+    kBlasMode
+  >::SymmKernel;
+
+    /// Define the kernel in terms of the default kernel
+  using SymmKernel = kernel::SymmUniversal<
+    typename DefaultSymmkernel::Mma1,
+    typename DefaultSymmkernel::Mma2,
+    typename DefaultSymmkernel::Epilogue, 
+    ThreadblockSwizzle,
+    SideModeA,
+    FillModeA
+  >;
+};
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/default_trmm.h b/include/cutlass/gemm/kernel/default_trmm.h
new file mode 100644
index 00000000..a3d6b9ef
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_trmm.h
@@ -0,0 +1,199 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+// 
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_trmm.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+
+#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode SideMode_,
+    /// Fill Mode for the triangular matrix
+    FillMode FillMode_,
+    /// Diag Type for the triangular matrix
+    DiagType DiagType_,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultTrmm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by GEMM
+    typename Operator>
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+                   kSideMode, kFillMode, kDiagType, ElementC,
+                   layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+                   arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+                   EpilogueOutputOp, ThreadblockSwizzle, Stages, SplitKSerial,
+                   Operator> {
+                    
+  /// Define the threadblock-scoped triagular matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultTrmm<
+      ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
+      kSideMode, kFillMode, kDiagType, 
+      ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80,
+      ThreadblockShape, WarpShape, InstructionShape, Stages,
+      Operator>::ThreadblockMma;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
+          ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp,
+          EpilogueOutputOp::kCount>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
diff --git a/include/cutlass/gemm/kernel/default_trmm_complex.h b/include/cutlass/gemm/kernel/default_trmm_complex.h
new file mode 100644
index 00000000..9dcb5a2f
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_trmm_complex.h
@@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/epilogue/threadblock/epilogue.h"
+#include "cutlass/epilogue/thread/linear_combination.h"
+
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h"
+#include "cutlass/gemm/threadblock/default_mma.h"
+#include "cutlass/gemm/threadblock/default_multistage_trmm_complex.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h"
+#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
+
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+  /// Element type for A matrix operand
+  typename ElementA_,
+  /// Layout type for A matrix operand
+  typename LayoutA_,
+  /// Element type for B matrix operand
+  typename ElementB_,
+  /// Layout type for B matrix operand
+  typename LayoutB_,
+  /// Side Mode for the kernel
+  SideMode SideMode_,
+  /// Fill Mode for the triangular matrix
+  FillMode FillMode_,
+  /// Diag Type for the triangular matrix
+  DiagType DiagType_,
+  /// Element type for C and D matrix operands
+  typename ElementC_,
+  /// Layout type for C and D matrix operands
+  typename LayoutC_,
+  /// Element type for internal accumulation
+  typename ElementAccumulator,
+  /// Operator class tag
+  typename OperatorClass,
+  /// Tag indicating architecture to tune for
+  typename ArchTag,
+  /// Threadblock-level tile size (concept: GemmShape)
+  typename ThreadblockShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename WarpShape,
+  /// Warp-level tile size (concept: GemmShape)
+  typename InstructionShape,
+  /// Epilogue output operator
+  typename EpilogueOutputOp,
+  /// Threadblock-level swizzling operator
+  typename ThreadblockSwizzle,
+  /// Number of stages used in the pipelined mainloop
+  int Stages,
+  /// Complex elementwise transformation on A operand
+  ComplexTransform TransformA,
+  /// Complex elementwise transformation on B operand
+  ComplexTransform TransformB,
+  /// Multiply-add operator 
+  // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+  typename Operator,
+  /// If true, kernel is configured to support serial reduction in the epilogue
+  bool SplitKSerial
+>
+struct DefaultTrmmComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for Ampere Architecture
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Multiply-add operator 
+    // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator,
+    /// If true, kernel is configured to support serial reduction in the epilogue
+    bool SplitKSerial
+  >
+struct DefaultTrmmComplex<
+  ElementA, LayoutA, ElementB, LayoutB, 
+  kSideMode, kFillMode, kDiagType,
+  ElementC, layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp,
+  arch::Sm80, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> {
+
+  /// Define the threadblock-scoped matrix multiply-accumulate
+  using Mma = typename cutlass::gemm::threadblock::DefaultMultistageTrmmComplex<
+      ElementA, LayoutA, ElementB, LayoutB, 
+      kSideMode, kFillMode, kDiagType,
+      ElementAccumulator,layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape,
+      WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma;
+
+  /// Define the epilogue
+  using Epilogue =
+      typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp<
+          ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp,
+          EpilogueOutputOp::kCount, Operator>::Epilogue;
+
+  /// Define the kernel-level TRMM operator.
+  using TrmmKernel = kernel::TrmmUniversal<Mma, Epilogue, ThreadblockSwizzle, kSideMode, kFillMode, kDiagType>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/default_trmm_universal.h b/include/cutlass/gemm/kernel/default_trmm_universal.h
new file mode 100644
index 00000000..58c2e814
--- /dev/null
+++ b/include/cutlass/gemm/kernel/default_trmm_universal.h
@@ -0,0 +1,359 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+      Default kernel-level TRMM definitions combine threadblock-scoped matrix multiply-add with
+      the appropriate threadblock-scoped epilogue.
+  
+      Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are
+      accommodated by exchanging A and B operands and assuming transposed layouts.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+
+#include "cutlass/complex.h"
+#include "cutlass/layout/matrix.h"
+
+#include "cutlass/gemm/kernel/trmm_universal.h"
+#include "cutlass/gemm/kernel/default_trmm.h"
+#include "cutlass/gemm/kernel/default_trmm_complex.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator,
+    ///
+    typename Enable = void
+    >
+struct DefaultTrmmUniversal;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Real-valued TRMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator>
+struct DefaultTrmmUniversal<
+  ElementA,
+  LayoutA,
+  ComplexTransform::kNone,   // transform A
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  ComplexTransform::kNone,   // transform B
+  kAlignmentB,
+  kSideMode,
+  kFillMode,
+  kDiagType,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  typename std::enable_if< ! cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultTrmmKernel = typename kernel::DefaultTrmm<
+    ElementA,
+    LayoutA,
+    kAlignmentA,
+    ElementB,
+    LayoutB,
+    kAlignmentB,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    SplitKSerial,
+    Operator
+  >::TrmmKernel;
+
+    /// Define the kernel in terms of the default kernel
+  using TrmmKernel = kernel::TrmmUniversal<
+    typename DefaultTrmmKernel::Mma,
+    typename DefaultTrmmKernel::Epilogue, 
+    ThreadblockSwizzle,
+    kSideMode,
+    kFillMode,
+    kDiagType
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+//
+// Complex-valued TRMM kernels
+//
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Complex elementwise transformation on A operand
+    ComplexTransform TransformA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Complex elementwise transformation on B operand
+    ComplexTransform TransformB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for C and D matrix operands
+    typename ElementC,
+    /// Layout type for C and D matrix operands
+    typename LayoutC,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Operator class tag
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Epilogue output operator
+    typename EpilogueOutputOp,
+    /// Threadblock-level swizzling operator
+    typename ThreadblockSwizzle,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// If true, kernel is configured to support serial reduction in the
+    /// epilogue
+    bool SplitKSerial,
+    /// Operation performed by TRMM
+    typename Operator
+  >
+struct DefaultTrmmUniversal<
+  ElementA,
+  LayoutA,
+  TransformA,
+  kAlignmentA,
+  ElementB,
+  LayoutB,
+  TransformB,
+  kAlignmentB,
+  kSideMode,
+  kFillMode,
+  kDiagType,
+  ElementC,
+  LayoutC,
+  ElementAccumulator,
+  OperatorClass,
+  ArchTag,
+  ThreadblockShape,
+  WarpShape,
+  InstructionShape,
+  EpilogueOutputOp,
+  ThreadblockSwizzle,
+  Stages,
+  SplitKSerial,
+  Operator,
+  typename std::enable_if<cutlass::is_complex<ElementAccumulator>::value>::type
+> {
+
+  using DefaultTrmmKernel = typename kernel::DefaultTrmmComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    kSideMode,
+    kFillMode,
+    kDiagType,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    OperatorClass,
+    ArchTag,
+    ThreadblockShape,
+    WarpShape,
+    InstructionShape,
+    EpilogueOutputOp,
+    ThreadblockSwizzle,
+    Stages,
+    TransformA,
+    TransformB,
+    Operator,
+    SplitKSerial
+  >::TrmmKernel;
+
+  /// Define the kernel in terms of the default kernel
+  using TrmmKernel = kernel::TrmmUniversal<
+    typename DefaultTrmmKernel::Mma,
+    typename DefaultTrmmKernel::Epilogue, 
+    ThreadblockSwizzle,
+    kSideMode,
+    kFillMode,
+    kDiagType
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace kernel
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h
index e3d226fd..02676ede 100644
--- a/include/cutlass/gemm/kernel/gemm.h
+++ b/include/cutlass/gemm/kernel/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -78,6 +84,10 @@ struct Gemm {
     typename OutputOp::Params output_op;
     int *semaphore;
     int gemm_k_size;
+    // For gather+scatter operations
+    int const *gather_A_indices;
+    int const *gather_B_indices;
+    int const *scatter_D_indices;
 
     //
     // Methods
@@ -95,7 +105,10 @@ struct Gemm {
       typename Epilogue::OutputTileIterator::TensorRef ref_C,
       typename Epilogue::OutputTileIterator::TensorRef ref_D,
       typename OutputOp::Params output_op = typename OutputOp::Params(),
-      int *workspace = nullptr
+      int *workspace = nullptr,
+      int const *gather_A_indices = nullptr,
+      int const *gather_B_indices = nullptr,
+      int const *scatter_D_indices = nullptr
     ):
       problem_size(problem_size),
       grid_tiled_shape(grid_tiled_shape),
@@ -108,7 +121,10 @@ struct Gemm {
       ref_C(ref_C),
       params_D(ref_D.layout()),
       ref_D(ref_D),
-      output_op(output_op) {
+      output_op(output_op),
+      gather_A_indices(gather_A_indices),
+      gather_B_indices(gather_B_indices),
+      scatter_D_indices(scatter_D_indices) {
 
       int total_gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
       int gemm_k_iterations = (total_gemm_k_iterations + grid_tiled_shape.k() - 1) / grid_tiled_shape.k();
@@ -133,6 +149,7 @@ struct Gemm {
   Gemm() { } 
 
   /// Determines whether kernel satisfies alignment
+  CUTLASS_HOST_DEVICE
   static Status can_implement(
     cutlass::gemm::GemmCoord const & problem_size,
     typename Mma::IteratorA::TensorRef ref_A,
@@ -154,7 +171,13 @@ struct Gemm {
                                                         layout::RowMajorInterleaved<64>>::value)
                                      ? 64
                                      : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+    static int const kAlignmentC = (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<typename Epilogue::OutputTileIterator::Layout,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
 
     if (!TensorRef_aligned(ref_A, kAlignmentA)) {
       return Status::kErrorMisalignedOperand;
@@ -172,13 +195,6 @@ struct Gemm {
       return Status::kErrorMisalignedOperand;
     }
 
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
-
-      return Status::kErrorMisalignedOperand;
-    }
-
     return Status::kSuccess;
   }
 
@@ -227,14 +243,16 @@ struct Gemm {
       params.ref_A.data(),
       {params.problem_size.m(), problem_size_k},
       thread_idx,
-      tb_offset_A);
+      tb_offset_A,
+      params.gather_A_indices);
 
     typename Mma::IteratorB iterator_B(
       params.params_B,
       params.ref_B.data(),
       {problem_size_k, params.problem_size.n()},
       thread_idx,
-      tb_offset_B);
+      tb_offset_B,
+      params.gather_B_indices);
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
@@ -297,7 +315,8 @@ struct Gemm {
       params.ref_C.data(),
       params.problem_size.mn(),
       thread_idx,
-      threadblock_offset
+      threadblock_offset,
+      params.scatter_D_indices
     );
 
     // Tile iterator writing to destination tensor.
@@ -306,7 +325,8 @@ struct Gemm {
       params.ref_D.data(),
       params.problem_size.mn(),
       thread_idx,
-      threadblock_offset
+      threadblock_offset,
+      params.scatter_D_indices
     );
 
     Epilogue epilogue(
diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h
index 399bba9c..0d8a9aad 100644
--- a/include/cutlass/gemm/kernel/gemm_array.h
+++ b/include/cutlass/gemm/kernel/gemm_array.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h
index 4edb17c8..7a9f7af3 100644
--- a/include/cutlass/gemm/kernel/gemm_batched.h
+++ b/include/cutlass/gemm/kernel/gemm_batched.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_grouped.h b/include/cutlass/gemm/kernel/gemm_grouped.h
index c9fe2c3e..ceca1f7d 100644
--- a/include/cutlass/gemm/kernel/gemm_grouped.h
+++ b/include/cutlass/gemm/kernel/gemm_grouped.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -548,7 +554,7 @@ struct GemmGrouped {
 
       // Wait for all threads to finish their epilogue phases from the previous tile.
       __syncthreads();
-      
+
       // Compute threadblock-scoped matrix multiply-add
       mma(
         gemm_k_iterations, 
diff --git a/include/cutlass/gemm/kernel/gemm_params.h b/include/cutlass/gemm/kernel/gemm_params.h
index 0fd0b58c..31b54d4d 100755
--- a/include/cutlass/gemm/kernel/gemm_params.h
+++ b/include/cutlass/gemm/kernel/gemm_params.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h
index 63674567..ee28b476 100644
--- a/include/cutlass/gemm/kernel/gemm_pipelined.h
+++ b/include/cutlass/gemm/kernel/gemm_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h
index dbba1cf8..7b85fdbd 100644
--- a/include/cutlass/gemm/kernel/gemm_planar_complex.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -405,10 +411,29 @@ struct GemmPlanarComplex {
     static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
     static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
 
-    if ((args.problem_size.m() % kAlignmentA) || (args.problem_size.k() % kAlignmentA) ||
-      (args.problem_size.n() % kAlignmentB) || (args.problem_size.k() % kAlignmentB) ||
-      (args.problem_size.m() % kAlignmentC) || (args.problem_size.n() % kAlignmentC)) {
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = args.problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = args.problem_size.m() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = args.problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = args.problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = args.problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = args.problem_size.m() % kAlignmentC;
+    }
 
+    if (isAMisaligned || isBMisaligned || isCMisaligned) {
       return Status::kErrorMisalignedOperand;
     }
 
diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
index 5cd096b6..1360a2c0 100644
--- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -361,10 +367,29 @@ struct GemmPlanarComplexArray {
     static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
     static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
 
-    if ((args.problem_size.m() % kAlignmentA) || (args.problem_size.k() % kAlignmentA) ||
-      (args.problem_size.n() % kAlignmentB) || (args.problem_size.k() % kAlignmentB) ||
-      (args.problem_size.m() % kAlignmentC) || (args.problem_size.n() % kAlignmentC)) {
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = args.problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = args.problem_size.m() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = args.problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = args.problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = args.problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = args.problem_size.m() % kAlignmentC;
+    }
 
+    if (isAMisaligned || isBMisaligned || isCMisaligned) {
       return Status::kErrorMisalignedOperand;
     }
 
diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
index 9b310eb3..f9b117fd 100644
--- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_transpose_operands.h b/include/cutlass/gemm/kernel/gemm_transpose_operands.h
index 69e25eee..ab80f4da 100644
--- a/include/cutlass/gemm/kernel/gemm_transpose_operands.h
+++ b/include/cutlass/gemm/kernel/gemm_transpose_operands.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h
index d06822a2..c11e308a 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.h
+++ b/include/cutlass/gemm/kernel/gemm_universal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -127,6 +133,10 @@ struct GemmUniversal {
     typename LayoutC::Stride::LongIndex ldc;
     typename LayoutC::Stride::LongIndex ldd;
 
+    int const * ptr_gather_A_indices;
+    int const * ptr_gather_B_indices;
+    int const * ptr_scatter_D_indices;
+
     //
     // Methods
     //
@@ -134,7 +144,10 @@ struct GemmUniversal {
     Arguments(): 
       mode(GemmUniversalMode::kGemm), 
       batch_count(1), 
-      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr) { }
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
+      ptr_gather_A_indices(ptr_gather_A_indices),
+      ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices) {}
 
     /// constructs an arguments structure
     Arguments(
@@ -153,7 +166,10 @@ struct GemmUniversal {
       typename LayoutA::Stride stride_a,
       typename LayoutB::Stride stride_b,
       typename LayoutC::Stride stride_c,
-      typename LayoutC::Stride stride_d
+      typename LayoutC::Stride stride_d,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr
     ):
       mode(mode), 
       problem_size(problem_size), 
@@ -161,12 +177,13 @@ struct GemmUniversal {
       epilogue(epilogue), 
       ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
       batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
-      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d) {
+      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices) {
       lda = 0;
       ldb = 0;
       ldc = 0;
       ldd = 0;
-
       CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
       }
 
@@ -187,7 +204,10 @@ struct GemmUniversal {
       typename LayoutA::Stride::LongIndex lda,
       typename LayoutB::Stride::LongIndex ldb,
       typename LayoutC::Stride::LongIndex ldc,
-      typename LayoutC::Stride::LongIndex ldd
+      typename LayoutC::Stride::LongIndex ldd,
+      int const *ptr_gather_A_indices = nullptr,
+      int const *ptr_gather_B_indices = nullptr,
+      int const *ptr_scatter_D_indices = nullptr
     ):
       mode(mode), 
       problem_size(problem_size), 
@@ -195,7 +215,9 @@ struct GemmUniversal {
       epilogue(epilogue), 
       ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
       batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D),
-      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
+      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
+      ptr_scatter_D_indices(ptr_scatter_D_indices) {
       stride_a = make_Coord(lda);
       stride_b = make_Coord(ldb);
       stride_c = make_Coord(ldc);
@@ -212,6 +234,7 @@ struct GemmUniversal {
       std::swap(args.lda, args.ldb);
       std::swap(args.stride_a, args.stride_b);
       std::swap(args.batch_stride_A, args.batch_stride_B);
+      std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);
 
       return args;
     }
@@ -249,6 +272,10 @@ struct GemmUniversal {
     int64_t batch_stride_C;
     int64_t batch_stride_D;
 
+    int * ptr_gather_A_indices;
+    int * ptr_gather_B_indices;
+    int * ptr_scatter_D_indices;
+
     int *semaphore;
 
     //
@@ -273,6 +300,9 @@ struct GemmUniversal {
       batch_stride_B(0),
       batch_stride_C(0),
       batch_stride_D(0),
+      ptr_gather_A_indices(nullptr),
+      ptr_gather_B_indices(nullptr),
+      ptr_scatter_D_indices(nullptr),
       semaphore(nullptr) { }
 
     CUTLASS_HOST_DEVICE
@@ -301,9 +331,11 @@ struct GemmUniversal {
       batch_stride_B(args.batch_stride_B),
       batch_stride_C(args.batch_stride_C),
       batch_stride_D(args.batch_stride_D),
+      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
+      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
+      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices)),
       semaphore(static_cast<int *>(workspace)) {
 
-      CUTLASS_TRACE_HOST("GemmUniversal::Params::Params() - problem_size: " << problem_size);
     }
 
     CUTLASS_HOST_DEVICE
@@ -316,13 +348,17 @@ struct GemmUniversal {
       ptr_C = const_cast<void *>(args.ptr_C);
       ptr_D = args.ptr_D;
 
+      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
+      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
+      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
+
       batch_stride_A = args.batch_stride_A;
       batch_stride_B = args.batch_stride_B;
       batch_stride_C = args.batch_stride_C;
       batch_stride_D = args.batch_stride_D;
 
       output_op = args.epilogue;
-
+      
       semaphore = static_cast<int *>(workspace);
       CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
     }
@@ -349,27 +385,71 @@ struct GemmUniversal {
 
     CUTLASS_TRACE_HOST("GemmUniversal::can_implement()");
 
-    static int const kAlignmentA = (platform::is_same<typename Mma::IteratorA::Layout,
+    static int const kAlignmentA = (platform::is_same<LayoutA,
                                                       layout::ColumnMajorInterleaved<32>>::value)
                                    ? 32
-                                   : (platform::is_same<typename Mma::IteratorA::Layout,
+                                   : (platform::is_same<LayoutA,
                                                         layout::ColumnMajorInterleaved<64>>::value)
                                      ? 64
                                      : Mma::IteratorA::AccessType::kElements;
-    static int const kAlignmentB = (platform::is_same<typename Mma::IteratorB::Layout,
-                                                       layout::RowMajorInterleaved<32>>::value)
+    static int const kAlignmentB = (platform::is_same<LayoutB,
+                                                      layout::RowMajorInterleaved<32>>::value)
                                    ? 32
-                                   : (platform::is_same<typename Mma::IteratorB::Layout,
+                                   : (platform::is_same<LayoutB,
                                                         layout::RowMajorInterleaved<64>>::value)
                                      ? 64
                                      : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+    static int const kAlignmentC = (platform::is_same<LayoutC,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<LayoutC,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
 
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
 
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand");
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
       return Status::kErrorMisalignedOperand;
     }
 
@@ -455,14 +535,16 @@ struct GemmUniversal {
       ptr_A,
       {params.problem_size.m(), problem_size_k},
       thread_idx,
-      tb_offset_A);
+      tb_offset_A,
+      params.ptr_gather_A_indices);
 
     typename Mma::IteratorB iterator_B(
       params.params_B,
       ptr_B,
       {problem_size_k, params.problem_size.n()},
       thread_idx,
-      tb_offset_B);
+      tb_offset_B,
+      params.ptr_gather_B_indices);
 
     // Broadcast the warp_id computed by lane 0 to ensure dependent code
     // is compiled as warp-uniform.
@@ -552,7 +634,8 @@ struct GemmUniversal {
       ptr_C,
       params.problem_size.mn(),
       thread_idx,
-      threadblock_offset
+      threadblock_offset,
+      params.ptr_scatter_D_indices
     );
 
     // Tile iterator writing to destination tensor.
@@ -561,7 +644,8 @@ struct GemmUniversal {
       ptr_D,
       params.problem_size.mn(),
       thread_idx,
-      threadblock_offset
+      threadblock_offset,
+      params.ptr_scatter_D_indices
     );
 
     Epilogue epilogue(
@@ -579,7 +663,6 @@ struct GemmUniversal {
       }
 
       semaphore.wait(threadblock_tile_offset.k());
-
     }
 
 
diff --git a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
index 77e81fcb..264fde93 100644
--- a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
+++ b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -370,11 +376,49 @@ struct GemmWithFusedEpilogue {
     static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
     static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
 
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
+
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
+      return Status::kErrorMisalignedOperand;
+    }
 
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand");
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
       return Status::kErrorMisalignedOperand;
     }
 
diff --git a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
index 8f1600c2..ab69d7f6 100644
--- a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
+++ b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -344,13 +350,57 @@ struct GemmWithKReduction {
                                                         layout::RowMajorInterleaved<64>>::value)
                                      ? 64
                                      : Mma::IteratorB::AccessType::kElements;
-    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+    static int const kAlignmentC =  (platform::is_same<LayoutC,
+                                                      layout::ColumnMajorInterleaved<32>>::value)
+                                   ? 32
+                                   : (platform::is_same<LayoutC,
+                                                        layout::ColumnMajorInterleaved<64>>::value)
+                                     ? 64
+                                     : Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    bool isAMisaligned = false;
+    bool isBMisaligned = false;
+    bool isCMisaligned = false;
+
+    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
+      isAMisaligned = problem_size.m() % kAlignmentA;
+    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
+      isAMisaligned = problem_size.k() % kAlignmentA;
+    }
 
-    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
-      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
-      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
+      isBMisaligned = problem_size.n() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
+            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
+      isBMisaligned = problem_size.k() % kAlignmentB;
+    }
+
+    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
+      isCMisaligned = problem_size.m() % kAlignmentC;
+    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
+            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
+      isCMisaligned = problem_size.n() % kAlignmentC;
+    }
+
+    if (isAMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand A");
+      return Status::kErrorMisalignedOperand;
+    }
+
+    if (isBMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand B");
+      return Status::kErrorMisalignedOperand;
+    }
 
-      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand");
+    if (isCMisaligned) {
+      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for operand C");
       return Status::kErrorMisalignedOperand;
     }
 
diff --git a/include/cutlass/gemm/kernel/gemv.h b/include/cutlass/gemm/kernel/gemv.h
index 04aa305b..2d45fbb5 100644
--- a/include/cutlass/gemm/kernel/gemv.h
+++ b/include/cutlass/gemm/kernel/gemv.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h
index d20b50ed..393c41ae 100755
--- a/include/cutlass/gemm/kernel/gemv_batched_strided.h
+++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/rank_2k_universal.h b/include/cutlass/gemm/kernel/rank_2k_universal.h
new file mode 100644
index 00000000..0346cbbf
--- /dev/null
+++ b/include/cutlass/gemm/kernel/rank_2k_universal.h
@@ -0,0 +1,778 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma1_,                 ///! Threadblock-scoped matrix multiply-accumulate (A*B^T)
+  typename Mma2_,                 ///! Threadblock-scoped matrix multiply-accumulate (B*A^T)
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  FillMode FillModeC_,            ///! Fill Mode for C (kLower or kUpper)
+  BlasMode BlasMode_              ///! Blas3 computation mode
+>
+struct Rank2KUniversal {
+public:
+
+  using Mma1 = Mma1_;
+  using Mma2 = Mma2_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma1::IteratorA::Element;
+  using ElementB = typename Mma1::IteratorB::Element;
+
+  // Mma1 (A x B^T)
+  using LayoutA = typename Mma1::IteratorA::Layout;
+  using LayoutBT = typename Mma1::IteratorB::Layout;
+  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
+  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
+
+  // Mma2 (B x A^T)
+  using LayoutB = typename Mma2::IteratorA::Layout;
+  using LayoutAT = typename Mma2::IteratorB::Layout;
+  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
+  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
+
+  // Common type definitions for Mma1 and Mma2
+  using Operator = typename Mma1::Operator;
+  using OperatorClass = typename Mma1::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma1::Shape;
+  using WarpShape = typename Mma1::Operator::Shape;
+  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma1::ArchTag;
+
+  static int const kStages = Mma1::kStages;
+  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+
+  // Output related typedefinitions
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static FillMode const kFillModeC = FillModeC_;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+  static BlasMode const kBlasMode = BlasMode_;
+
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma1::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+
+    //
+    // Methods
+    //
+    
+    Arguments(): 
+      mode(GemmUniversalMode::kGemm), 
+      batch_count(1), 
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr) { }
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode), 
+      problem_size(problem_size), 
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
+
+      }
+
+      /// Returns arguments for a the transposed problem
+      Arguments transposed_problem() const {
+        Arguments args(*this);
+        
+        std::swap(args.ptr_A, args.ptr_B);
+        std::swap(args.lda, args.ldb);
+        std::swap(args.batch_stride_A, args.batch_stride_B);
+
+        return args;
+      }
+
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    
+    // Mma1 Iterator A and B params
+    typename Mma1::IteratorA::Params params_A;
+    typename Mma1::IteratorB::Params params_BT;
+
+    // Mma2 Iterator A and B params 
+    typename Mma2::IteratorA::Params params_B;
+    typename Mma2::IteratorB::Params params_AT;
+
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    
+    typename EpilogueOutputOp::Params output_op;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    int *semaphore;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      params_A(0),
+      params_BT(0),
+      params_B(0),
+      params_AT(0),
+      params_C(0),
+      params_D(0),
+      batch_count(0),
+      gemm_k_size(0),
+      mode(cutlass::gemm::GemmUniversalMode::kGemm),
+      ptr_A(nullptr),
+      ptr_B(nullptr),
+      ptr_C(nullptr),
+      ptr_D(nullptr),
+      batch_stride_A(0),
+      batch_stride_B(0),
+      batch_stride_C(0),
+      batch_stride_D(0),
+      semaphore(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_BT(args.ldb),
+      params_B(args.ldb),
+      params_AT(args.lda),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma1::SharedStorage mma1_main_loop;
+    typename Mma2::SharedStorage mma2_main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  Rank2KUniversal() { } 
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+   
+    // Early exit if Fill Mode is Lower and
+    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
+    if (kFillModeC == cutlass::FillMode::kLower &&
+        (threadblock_tile_offset.m() + 1) * Mma1::Shape::kM <= threadblock_tile_offset.n() * Mma1::Shape::kN) {
+      return;
+    }    
+    
+    // Early exit if Fill Mode is Upper and
+    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
+    if (kFillModeC == cutlass::FillMode::kUpper &&
+        threadblock_tile_offset.m() * Mma1::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+      return;
+    }    
+    
+    bool tile_on_diagonal = false;
+    // Mark tiles that are being crossed by the main diagonal
+    // (top-right and bottom-left corners are on either side of the diagonal)
+    if ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM > threadblock_tile_offset.n() * Mma1::Shape::kN
+        && threadblock_tile_offset.m() * Mma1::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma1::Shape::kN) {
+      tile_on_diagonal = true;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_MxK{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands for Mma1
+    typename Mma1::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK);
+
+    typename Mma1::IteratorB iterator_BT(
+      params.params_BT,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN);
+
+    // Construct iterators to A and B operands for Mma2
+    typename Mma2::IteratorA iterator_B(
+      params.params_B,
+      ptr_B,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK);
+
+    typename Mma2::IteratorB iterator_AT(
+      params.params_AT,
+      ptr_A,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply for Mma1 (A x BT)
+    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
+
+    // Construct thread-scoped matrix multiply for Mma2 (B x AT)
+    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma1::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add (A x BT)
+    mma1(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_BT, 
+      accumulators);
+
+    // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
+    if (kBlasMode == BlasMode::kHermitian) {
+
+      //
+      // Epilogue
+      //
+
+      EpilogueOutputOp output_op(params.output_op);
+
+      //
+      // Masked tile iterators constructed from members
+      //
+
+      threadblock_tile_offset =
+          threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+      //assume identity swizzle
+      MatrixCoord threadblock_offset(
+        threadblock_tile_offset.m() * Mma1::Shape::kM,
+        threadblock_tile_offset.n() * Mma1::Shape::kN
+      );
+
+      int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+      ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+      ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+      //
+      // Fetch pointers based on mode.
+      //
+      
+      // Construct the semaphore.
+      Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+      if (params.mode == GemmUniversalMode::kGemm) {
+
+        // If performing a reduction via split-K, fetch the initial synchronization
+        if (params.grid_tiled_shape.k() > 1) {
+          
+          // Fetch the synchronization lock initially but do not block.
+          semaphore.fetch();
+
+          // Indicate which position in a serial reduction the output operator is currently updating
+          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        }
+      }
+      else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      }
+      else if (params.mode == GemmUniversalMode::kBatched) {
+        ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+        ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+      }
+      else if (params.mode == GemmUniversalMode::kArray) {
+        ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+        ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+      }
+
+      
+      // If CTA not on diagonal, FillMode doesn't apply. 
+      FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+      // Tile iterator loading from source tensor.
+      typename Epilogue::OutputTileIterator iterator_C(
+        params.params_C,
+        ptr_C,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset,
+        kFillModeCTA
+      );
+
+      // Tile iterator writing to destination tensor.
+      typename Epilogue::OutputTileIterator iterator_D(
+        params.params_D,
+        ptr_D,
+        params.problem_size.mn(),
+        thread_idx,
+        threadblock_offset,
+        kFillModeCTA
+      );
+
+      Epilogue epilogue(
+        shared_storage.epilogue, 
+        thread_idx, 
+        warp_idx, 
+        lane_idx);
+
+      // Wait on the semaphore - this latency may have been covered by iterator construction
+      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+          
+        // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+        if (threadblock_tile_offset.k()) {
+          iterator_C = iterator_D;
+        }
+
+        semaphore.wait(threadblock_tile_offset.k());
+
+        __threadfence();
+      }
+
+      // Execute the epilogue operator to update the destination tensor.
+      epilogue(
+        output_op, 
+        iterator_D, 
+        accumulators, 
+        iterator_C); 
+      
+      //
+      // Release the semaphore
+      //
+
+      if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+        int lock = 0;
+        if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+          // The final threadblock resets the semaphore for subsequent grids.
+          lock = 0;
+        }
+        else {
+          // Otherwise, the semaphore is incremented
+          lock = threadblock_tile_offset.k() + 1;
+        }
+        
+        semaphore.release(lock);
+      }
+
+      __syncthreads();
+
+      accumulators.clear();
+    }
+
+    // Compute threadblock-scoped matrix multiply-add (B x AT)
+    mma2(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_B, 
+      iterator_AT, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    /* Needed for HER2K where the second HERK is multiplied by conj(alpha) */
+    typename EpilogueOutputOp::Params second_her2k_params(conj(params.output_op.alpha), 1);
+    EpilogueOutputOp output_op_her2k(second_her2k_params);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
+
+    // HER2K kernel needs Alpha to be complex and is conj(Alpha) is applied to the second HERK.
+    if (kBlasMode == BlasMode::kHermitian) {
+      ptr_C = static_cast<ElementC *>(params.ptr_D);
+    }
+
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        if (kBlasMode == BlasMode::kSymmetric) {
+          output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        } else {
+          output_op_her2k.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+        }
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    
+    // If CTA not on diagonal, FillMode doesn't apply. 
+    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    if (kBlasMode == BlasMode::kSymmetric) {
+      epilogue(
+        output_op,
+        iterator_D,
+        accumulators,
+        iterator_C);
+    } else {
+      epilogue(
+        output_op_her2k,
+        iterator_D,
+        accumulators,
+        iterator_C);
+    }
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/rank_k_universal.h b/include/cutlass/gemm/kernel/rank_k_universal.h
new file mode 100644
index 00000000..41f53898
--- /dev/null
+++ b/include/cutlass/gemm/kernel/rank_k_universal.h
@@ -0,0 +1,565 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  FillMode FillModeC_             ///! Fill Mode for C (kLower or kUpper)
+>
+struct RankKUniversal {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static FillMode const kFillModeC = FillModeC_;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = 128 / sizeof_bits<ElementA>::value;
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+
+    //
+    // Methods
+    //
+    
+    Arguments(): 
+      mode(GemmUniversalMode::kGemm), 
+      batch_count(1), 
+      ptr_A(nullptr), ptr_C(nullptr), ptr_D(nullptr) { }
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode), 
+      problem_size(problem_size), 
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_C(ptr_C), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
+
+      }
+
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+   
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    
+    typename EpilogueOutputOp::Params output_op;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    int *semaphore;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      params_A(0),
+      params_B(0),
+      params_C(0),
+      params_D(0),
+      batch_count(0),
+      gemm_k_size(0),
+      mode(cutlass::gemm::GemmUniversalMode::kGemm),
+      ptr_A(nullptr),
+      ptr_B(nullptr),
+      ptr_C(nullptr),
+      ptr_D(nullptr),
+      batch_stride_A(0),
+      batch_stride_B(0),
+      batch_stride_C(0),
+      batch_stride_D(0),
+      semaphore(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_B(args.lda),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_A)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_A),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_A);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  RankKUniversal() { } 
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+   
+    // Early exit if Fill Mode is Lower and
+    // if the entire tile is above the main diagonal (bottom-left corner is at or above the diagonal)
+    if (kFillModeC == cutlass::FillMode::kLower &&
+        (threadblock_tile_offset.m() + 1) * Mma::Shape::kM <= threadblock_tile_offset.n() * Mma::Shape::kN) {
+      return;
+    }    
+    
+    // Early exit if Fill Mode is Upper and
+    // if the entire tile is below the main diagonal (top-right corner is at or below the diagonal)
+    if (kFillModeC == cutlass::FillMode::kUpper &&
+        threadblock_tile_offset.m() * Mma::Shape::kM >= (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
+      return;
+    }    
+    
+    bool tile_on_diagonal = false;
+    // Mark tiles that are being crossed by the main diagonal
+    // (top-right and bottom-left corners are on either side of the diagonal)
+    if ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM > threadblock_tile_offset.n() * Mma::Shape::kN
+        && threadblock_tile_offset.m() * Mma::Shape::kM < (threadblock_tile_offset.n() + 1) * Mma::Shape::kN) {
+      tile_on_diagonal = true;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    
+    // If CTA not on diagonal, FillMode doesn't apply. 
+    FillMode kFillModeCTA = tile_on_diagonal ? kFillModeC : FillMode::kNone;
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset,
+      kFillModeCTA
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h
index 8e765464..849ed08c 100644
--- a/include/cutlass/gemm/kernel/sparse_gemm.h
+++ b/include/cutlass/gemm/kernel/sparse_gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/kernel/symm_universal.h b/include/cutlass/gemm/kernel/symm_universal.h
new file mode 100755
index 00000000..4e25e6be
--- /dev/null
+++ b/include/cutlass/gemm/kernel/symm_universal.h
@@ -0,0 +1,698 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma1_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (A*B or B*A)
+  typename Mma2_,                 ///! Threadblock-scoped triangular matrix multiply-accumulate (AT*B or B*AT)
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
+  FillMode FillMode_              ///! Fill Mode for triangular matrix (kLower or kUpper)
+>
+struct SymmUniversal {
+public:
+
+  using Mma1 = Mma1_;
+  using Mma2 = Mma2_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma1::IteratorA::Element;
+  using ElementB = typename Mma1::IteratorB::Element;
+
+  // Mma1 (TRMM - with diagonal: C_tmp = alpha * A * B)
+  using LayoutA = typename Mma1::IteratorA::Layout;
+  using LayoutBT = typename Mma1::IteratorB::Layout;
+  static ComplexTransform const kMma1TransformA = Mma1::kTransformA;
+  static ComplexTransform const kMma1TransformB = Mma1::kTransformB;
+
+  // Mma2 (TRMM - withOUT diagonal: alpha * AT * B)
+  using LayoutB = typename Mma2::IteratorA::Layout;
+  using LayoutAT = typename Mma2::IteratorB::Layout;
+  static ComplexTransform const kMma2TransformA = Mma2::kTransformA;
+  static ComplexTransform const kMma2TransformB = Mma2::kTransformB;
+
+  // Common type definitions for Mma1 and Mma2
+  using Operator = typename Mma1::Operator;
+  using OperatorClass = typename Mma1::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma1::Shape;
+  using WarpShape = typename Mma1::Operator::Shape;
+  using InstructionShape = typename Mma1::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma1::ArchTag;
+
+  static int const kStages = Mma1::kStages;
+  static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+
+  // Output related typedefinitions
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static SideMode const kSideModeA = SideMode_;
+  static FillMode const kFillModeA = FillMode_;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma1::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void const * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldc;
+    typename LayoutC::Stride::Index ldd;
+
+    //
+    // Methods
+    //
+    
+    Arguments(): 
+      mode(GemmUniversalMode::kGemm), 
+      batch_count(1), 
+      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr) { }
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void const * ptr_C,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_C,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldc,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode), 
+      problem_size(problem_size), 
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_C(batch_stride_C), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd) {
+
+      }
+
+    /// Returns arguments for the transposed problem sizes
+    Arguments transposed_problem_size() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+
+      return args;
+    }
+
+    /// Returns arguments for the transposed matrices
+    Arguments swapped_matrices() const {
+      Arguments args(*this);
+
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+    
+    // Mma1 Iterator A and B params
+    typename Mma1::IteratorA::Params params_A_mma1;
+    typename Mma1::IteratorB::Params params_B_mma1;
+
+    // Mma2 Iterator A and B params 
+    typename Mma2::IteratorA::Params params_A_mma2;
+    typename Mma2::IteratorB::Params params_B_mma2;
+
+    typename Epilogue::OutputTileIterator::Params params_C;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    
+    typename EpilogueOutputOp::Params output_op;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_C;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_C;
+    int64_t batch_stride_D;
+
+    int *semaphore;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      params_A_mma1(0),
+      params_B_mma1(0),
+      params_A_mma2(0),
+      params_B_mma2(0),
+      params_C(0),
+      params_D(0),
+      batch_count(0),
+      gemm_k_size(0),
+      mode(cutlass::gemm::GemmUniversalMode::kGemm),
+      ptr_A(nullptr),
+      ptr_B(nullptr),
+      ptr_C(nullptr),
+      ptr_D(nullptr),
+      batch_stride_A(0),
+      batch_stride_B(0),
+      batch_stride_C(0),
+      batch_stride_D(0),
+      semaphore(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A_mma1(args.lda),
+      params_B_mma1(args.ldb),
+      params_A_mma2(args.lda),
+      params_B_mma2(args.ldb),
+      params_C(args.ldc),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_C(const_cast<void *>(args.ptr_C)),
+      ptr_D(const_cast<void *>(args.ptr_D)),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_C(args.batch_stride_C),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_C = const_cast<void *>(args.ptr_C);
+      ptr_D = args.ptr_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma1::SharedStorage mma1_main_loop;
+    typename Mma2::SharedStorage mma2_main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  SymmUniversal() { } 
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    static int const kAlignmentA = Mma1::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma1::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes two GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+      return;
+    }
+   
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_MxK_mma1{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN_mma1{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+    cutlass::MatrixCoord tb_offset_MxK_mma2{
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_KxN_mma2{
+      offset_k,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply for Mma1
+    Mma1 mma1(shared_storage.mma1_main_loop, thread_idx, warp_idx, lane_idx);
+
+    // Construct thread-scoped matrix multiply for Mma2
+    Mma2 mma2(shared_storage.mma2_main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma1::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+    int gemm_k_iterations_mma1 = gemm_k_iterations;
+    int gemm_k_iterations_mma2 = gemm_k_iterations;
+
+
+    /******************************************************************************************************
+     * SYMM (Side Mode, Fill Mode) is made of two TRMMs:
+      First TRMM (Mma1: Side Mode, Fill Mode, Non-Unit Diag): (A * B) or (B * A)
+      Second TRMM (Mma2: Side Mode, Inverted Fill Mode, Unit Diag): (AT * B) or (B * AT)
+
+     * For the first TRMM (Mma1) of SYMM, the following method is used to calculate the k-iterations:
+      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
+        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+
+      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
+        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
+                                   that can be skipped for all elements of this tile.
+        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
+                                    that can be skipped for all elements of this tile.
+
+      * For the second TRMM (Mma2) of SYMM, the k-iterations and threadblock offsets are calculated 
+        the same way as the first TRMM (Mma1) of same side mode but with inverted fill mode. 
+        For example, if the first TRMM is left sided with lower fill, the second TRMM would be 
+        left sided with upper fill.
+    ********************************************************************************************************/
+
+    if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kLower) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
+        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
+      }
+      
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 != 0) {
+        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
+        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
+      }
+
+    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 < gemm_k_iterations) {
+        gemm_k_iterations_mma1  = k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 != 0) {
+        tb_offset_MxK_mma2 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma2 * Mma1::Shape::kK});
+        tb_offset_KxN_mma2 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma2 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma2 -= k_iterations_till_diagonal_mma2;
+      }
+
+    } else if (kSideModeA == SideMode::kLeft && kFillModeA == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.m()) * Mma1::Shape::kM) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma1 != 0) {
+        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
+        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma1  -= k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.m() + 1) * Mma1::Shape::kM + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
+        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
+      }      
+
+    } else if (kSideModeA == SideMode::kRight && kFillModeA == FillMode::kLower) {
+
+      int k_iterations_till_diagonal_mma1 = ((threadblock_tile_offset.n()) * Mma1::Shape::kN) / Mma1::Shape::kK;
+
+      if (k_iterations_till_diagonal_mma1 != 0) {
+        tb_offset_MxK_mma1 += cutlass::MatrixCoord({0, k_iterations_till_diagonal_mma1 * Mma1::Shape::kK});
+        tb_offset_KxN_mma1 += cutlass::MatrixCoord({k_iterations_till_diagonal_mma1 * Mma1::Shape::kK, 0});
+        gemm_k_iterations_mma1 -= k_iterations_till_diagonal_mma1;
+      }
+
+      int k_iterations_till_diagonal_mma2 = ((threadblock_tile_offset.n() + 1) * Mma1::Shape::kN + Mma1::Shape::kK - 1) / Mma1::Shape::kK;
+      if (k_iterations_till_diagonal_mma2 < gemm_k_iterations) {
+        gemm_k_iterations_mma2  = k_iterations_till_diagonal_mma2;
+      }
+
+    }
+
+    // Construct iterators to A and B operands for Mma1
+    typename Mma1::IteratorA iterator_A_mma1(
+      params.params_A_mma1,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK_mma1);
+
+    typename Mma1::IteratorB iterator_B_mma1(
+      params.params_B_mma1,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN_mma1);
+
+    // Construct iterators to A and B operands for Mma2
+    typename Mma2::IteratorA iterator_A_mma2(
+      params.params_A_mma2,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_MxK_mma2);
+
+    typename Mma2::IteratorB iterator_B_mma2(
+      params.params_B_mma2,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_KxN_mma2);
+
+    // Compute threadblock-scoped matrix multiply-add (A x B) or (B x A)
+    mma1(
+      gemm_k_iterations_mma1, 
+      accumulators, 
+      iterator_A_mma1, 
+      iterator_B_mma1, 
+      accumulators);
+
+    // Compute threadblock-scoped matrix multiply-add (AT x B) or (B x AT)
+    mma2(
+      gemm_k_iterations_mma2, 
+      accumulators, 
+      iterator_A_mma2, 
+      iterator_B_mma2, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma1::Shape::kM,
+      threadblock_tile_offset.n() * Mma1::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C); 
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_C += threadblock_tile_offset.k() * params.batch_stride_C;
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_C = static_cast<ElementC * const *>(params.ptr_C)[threadblock_tile_offset.k()];
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    // Tile iterator loading from source tensor.
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_C,
+      ptr_C,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/kernel/trmm_universal.h b/include/cutlass/gemm/kernel/trmm_universal.h
new file mode 100644
index 00000000..2591f961
--- /dev/null
+++ b/include/cutlass/gemm/kernel/trmm_universal.h
@@ -0,0 +1,599 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief 
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+#include "cutlass/complex.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/core_io.h"
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Mma_,                  ///! Threadblock-scoped matrix multiply-accumulate 
+  typename Epilogue_,             ///! Epilogue
+  typename ThreadblockSwizzle_,   ///! Threadblock swizzling function
+  SideMode SideMode_,             ///! Side Mode for the kernel (kLeft or kRight)
+  FillMode FillMode_,             ///! Fill Mode for triangular matrix (kLower or kUpper)
+  DiagType DiagType_              ///! Diag Type for triangular matrix (kNonUnit or kUnit)
+>
+struct TrmmUniversal {
+public:
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename Epilogue::OutputTileIterator::Element;
+  using LayoutC = typename Epilogue::OutputTileIterator::Layout;
+  static SideMode const kSideMode = SideMode_;
+  static FillMode const kFillMode = FillMode_;
+  static DiagType const kDiagType = DiagType_;
+
+  static ComplexTransform const kTransformA = Mma::kTransformA;
+  static ComplexTransform const kTransformB = Mma::kTransformB;
+  using Operator = typename Mma::Operator;
+
+  using OperatorClass = typename Mma::Operator::OperatorClass;
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename Mma::Operator::Shape;
+  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
+  using ArchTag = typename Mma::ArchTag;
+
+  static int const kStages = Mma::kStages;
+  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+  static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  /// Split-K preserves splits that are 128b aligned
+  static int const kSplitKAlignment = const_max(128 / sizeof_bits<ElementA>::value, 128 / sizeof_bits<ElementB>::value);
+
+  //
+  // Structures
+  //
+
+  /// Argument structure
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    GemmUniversalMode mode;
+    GemmCoord problem_size;
+    int batch_count;
+
+    typename EpilogueOutputOp::Params epilogue;
+
+    void const * ptr_A;
+    void const * ptr_B;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_D;
+
+    typename LayoutA::Stride::Index lda;
+    typename LayoutB::Stride::Index ldb;
+    typename LayoutC::Stride::Index ldd;
+
+    //
+    // Methods
+    //
+    
+    Arguments(): 
+      mode(GemmUniversalMode::kGemm), 
+      batch_count(1), 
+      ptr_A(nullptr), ptr_B(nullptr), ptr_D(nullptr) { }
+
+    /// constructs an arguments structure
+    Arguments(
+      GemmUniversalMode mode,
+      GemmCoord problem_size,
+      int batch_count,
+      typename EpilogueOutputOp::Params epilogue,
+      void const * ptr_A,
+      void const * ptr_B,
+      void * ptr_D,
+      int64_t batch_stride_A,
+      int64_t batch_stride_B,
+      int64_t batch_stride_D,
+      typename LayoutA::Stride::Index lda,
+      typename LayoutB::Stride::Index ldb,
+      typename LayoutC::Stride::Index ldd
+    ):
+      mode(mode), 
+      problem_size(problem_size),
+      batch_count(batch_count),
+      epilogue(epilogue), 
+      ptr_A(ptr_A), ptr_B(ptr_B), ptr_D(ptr_D), 
+      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_D(batch_stride_D), 
+      lda(lda), ldb(ldb), ldd(ldd) {
+      }
+
+    /// Returns arguments for the transposed problem sizes
+    Arguments transposed_problem_size() const {
+      Arguments args(*this);
+
+      std::swap(args.problem_size.m(), args.problem_size.n());
+
+      return args;
+    }
+
+    /// Returns arguments for the transposed matrices
+    Arguments swapped_matrices() const {
+      Arguments args(*this);
+
+      std::swap(args.ptr_A, args.ptr_B);
+      std::swap(args.lda, args.ldb);
+      std::swap(args.batch_stride_A, args.batch_stride_B);
+
+      return args;
+    }
+  };
+
+  //
+  // Structure for precomputing values in host memory and passing to kernels
+  //
+
+  /// Parameters structure
+  struct Params {
+
+    cutlass::gemm::GemmCoord problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
+   
+    typename Mma::IteratorA::Params params_A;
+    typename Mma::IteratorB::Params params_B;
+    typename Epilogue::OutputTileIterator::Params params_D;
+    
+    typename EpilogueOutputOp::Params output_op;
+
+    GemmUniversalMode mode;
+    int batch_count;
+    int gemm_k_size;
+
+    void * ptr_A;
+    void * ptr_B;
+    void * ptr_D;
+
+    int64_t batch_stride_A;
+    int64_t batch_stride_B;
+    int64_t batch_stride_D;
+
+    int *semaphore;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      params_A(0),
+      params_B(0),
+      params_D(0),
+      batch_count(0),
+      gemm_k_size(0),
+      mode(cutlass::gemm::GemmUniversalMode::kGemm),
+      ptr_A(nullptr),
+      ptr_B(nullptr),
+      ptr_D(nullptr),
+      batch_stride_A(0),
+      batch_stride_B(0),
+      batch_stride_D(0),
+      semaphore(nullptr) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      cutlass::gemm::GemmCoord const & grid_tiled_shape,
+      int gemm_k_size,
+      void *workspace = nullptr
+    ):
+      problem_size(args.problem_size),
+      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
+      params_A(args.lda),
+      params_B(args.ldb),
+      params_D(args.ldd),
+      output_op(args.epilogue),
+      mode(args.mode),
+      batch_count(args.batch_count),
+      gemm_k_size(gemm_k_size),
+      ptr_A(const_cast<void *>(args.ptr_A)),
+      ptr_B(const_cast<void *>(args.ptr_B)),
+      ptr_D(args.ptr_D),
+      batch_stride_A(args.batch_stride_A),
+      batch_stride_B(args.batch_stride_B),
+      batch_stride_D(args.batch_stride_D),
+      semaphore(static_cast<int *>(workspace)) {
+    }
+
+    CUTLASS_HOST_DEVICE
+    void update(
+      Arguments const &args,
+      void *workspace = nullptr) {
+
+      ptr_A = const_cast<void *>(args.ptr_A);
+      ptr_B = const_cast<void *>(args.ptr_B);
+      ptr_D = args.ptr_D;
+
+      batch_stride_A = args.batch_stride_A;
+      batch_stride_B = args.batch_stride_B;
+      batch_stride_D = args.batch_stride_D;
+
+      output_op = args.epilogue;
+
+      semaphore = static_cast<int *>(workspace);
+    }
+
+  };
+
+  /// Shared memory storage structure
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+public:
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  TrmmUniversal() { } 
+
+  /// Determines whether kernel satisfies alignment
+  static Status can_implement(
+    cutlass::gemm::GemmCoord const & problem_size) {
+
+    static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
+    static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
+    static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess;
+
+    if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) ||
+      (problem_size.n() % kAlignmentB) || (problem_size.k() % kAlignmentB) ||
+      (problem_size.m() % kAlignmentC) || (problem_size.n() % kAlignmentC)) {
+
+      return Status::kErrorMisalignedOperand;
+    }
+
+    return Status::kSuccess;
+  }
+
+  static Status can_implement(Arguments const &args) {
+    return can_implement(args.problem_size);
+  }
+
+  /// Executes one GEMM
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
+
+      return;
+    }
+
+    int offset_k = 0;
+    int problem_size_k = params.problem_size.k();
+
+    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A); 
+    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    if (params.mode == GemmUniversalMode::kGemm || 
+      params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+
+      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
+
+        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size; 
+      }
+
+      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
+      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
+      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
+    }
+
+    __syncthreads();
+
+    // Compute initial location in logical coordinates
+    cutlass::MatrixCoord tb_offset_A{
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      offset_k,
+    };
+
+    cutlass::MatrixCoord tb_offset_B{
+      offset_k,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    };
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
+    
+    /******************************************************************************************************
+      First two cases: (Left Side, Lower Fill) and (Right Side, Upper Fill) are transpose of each other
+        - (Left Side, Lower Fill): calculate bottom of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+        - (Right Side, Upper Fill): calculate right end of the CTA tile,  then find the k-iterations 
+                                    needed to process all elements till that coordinate.
+
+      Last two cases: (Left Side, Upper Fill) and (Right Side, Lower Fill) are transpose of each other
+        - (Left Side, Upper Fill): calculate the top of the CTA tile, then find k-iterations 
+                                   that can be skipped for all elements of this tile.
+        - (Right Side, Lower Fill): calculate the left start of the CTA tile, then find k-iterations 
+                                    that can be skipped for all elements of this tile.
+    ********************************************************************************************************/
+ 
+    if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kLower) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.m() + 1) * Mma::Shape::kM + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      if (k_iterations_till_diagonal < gemm_k_iterations) {
+        gemm_k_iterations = k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.n() + 1) * Mma::Shape::kN + Mma::Shape::kK - 1) / Mma::Shape::kK;
+      if (k_iterations_till_diagonal < gemm_k_iterations) {
+        gemm_k_iterations = k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kLeft && kFillMode == FillMode::kUpper) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.m()) * Mma::Shape::kM) / Mma::Shape::kK;
+
+      if (k_iterations_till_diagonal != 0) {
+        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
+        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
+        gemm_k_iterations -= k_iterations_till_diagonal;
+      }
+
+    } else if (kSideMode == SideMode::kRight && kFillMode == FillMode::kLower) {
+
+      int k_iterations_till_diagonal = ((threadblock_tile_offset.n()) * Mma::Shape::kN) / Mma::Shape::kK;
+
+      if (k_iterations_till_diagonal != 0) {
+        tb_offset_A += cutlass::MatrixCoord({0, k_iterations_till_diagonal * Mma::Shape::kK});
+        tb_offset_B += cutlass::MatrixCoord({k_iterations_till_diagonal * Mma::Shape::kK, 0});
+        gemm_k_iterations -= k_iterations_till_diagonal;
+      }
+
+    }
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.params_A,
+      ptr_A,
+      {params.problem_size.m(), problem_size_k},
+      thread_idx,
+      tb_offset_A);
+
+    typename Mma::IteratorB iterator_B(
+      params.params_B,
+      ptr_B,
+      {problem_size_k, params.problem_size.n()},
+      thread_idx,
+      tb_offset_B);
+
+    // Compute threadblock-scoped matrix multiply-add
+    mma(
+      gemm_k_iterations, 
+      accumulators, 
+      iterator_A, 
+      iterator_B, 
+      accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    //
+    // Masked tile iterators constructed from members
+    //
+
+    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    //assume identity swizzle
+    MatrixCoord threadblock_offset(
+      threadblock_tile_offset.m() * Mma::Shape::kM,
+      threadblock_tile_offset.n() * Mma::Shape::kN
+    );
+
+    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
+
+    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
+
+    //
+    // Fetch pointers based on mode.
+    //
+    
+    // Construct the semaphore.
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    if (params.mode == GemmUniversalMode::kGemm) {
+
+      // If performing a reduction via split-K, fetch the initial synchronization
+      if (params.grid_tiled_shape.k() > 1) {
+        
+        // Fetch the synchronization lock initially but do not block.
+        semaphore.fetch();
+
+        // Indicate which position in a serial reduction the output operator is currently updating
+        output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k());
+      }
+    }
+    else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kBatched) {
+      ptr_D += threadblock_tile_offset.k() * params.batch_stride_D;
+    }
+    else if (params.mode == GemmUniversalMode::kArray) {
+      ptr_D = static_cast<ElementC * const *>(params.ptr_D)[threadblock_tile_offset.k()];
+    }
+
+    
+    // Tile iterator loading from source tensor (although irrelevant to this kernel as beta is zero).
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to destination tensor.
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.params_D,
+      ptr_D,
+      params.problem_size.mn(),
+      thread_idx,
+      threadblock_offset
+    );
+
+    Epilogue epilogue(
+      shared_storage.epilogue, 
+      thread_idx, 
+      warp_idx, 
+      lane_idx);
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
+        
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_offset.k()) {
+        iterator_C = iterator_D;
+      }
+
+      semaphore.wait(threadblock_tile_offset.k());
+
+      __threadfence();
+    }
+
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(
+      output_op, 
+      iterator_D, 
+      accumulators, 
+      iterator_C); 
+    
+    //
+    // Release the semaphore
+    //
+
+    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) { 
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_offset.k() + 1;
+      }
+      
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace gemm
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h
index c58c9103..ca6c4b2c 100644
--- a/include/cutlass/gemm/thread/mma.h
+++ b/include/cutlass/gemm/thread/mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h
index 25232a9e..25336424 100644
--- a/include/cutlass/gemm/thread/mma_sm50.h
+++ b/include/cutlass/gemm/thread/mma_sm50.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -106,6 +112,8 @@ struct MmaGeneric {
     ElementC, LayoutC,
     Operator>;
 
+  static bool const kMultipleOf2 = ((Shape::kM % 2 == 0) && (Shape::kN % 2 == 0));
+
   //
   // Methods
   //
@@ -135,30 +143,106 @@ struct MmaGeneric {
     // Compute matrix product
     CUTLASS_PRAGMA_UNROLL
     for (int k = 0; k < Shape::kK; ++k) {
+      #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 860)
+      if (kMultipleOf2 && platform::is_same<ElementA, float>::value && platform::is_same<ElementB, float>::value && platform::is_same<ElementC, float>::value) {
 
-      CUTLASS_PRAGMA_UNROLL
-      for (int n = 0; n < Shape::kN; ++n) {
-
+        //2x2 zigzag - m and n loops to increment by 2. Inner loop to process 4 multiply-adds in a 2x2 tile.
         CUTLASS_PRAGMA_UNROLL
-        for (int m = 0; m < Shape::kM; ++m) {
-
-          int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
-
-          MatrixCoord mn(m_serpentine, n);
-          MatrixCoord mk(m_serpentine, k);
-          MatrixCoord kn(k, n);
-
-          Array<ElementC, 1> d;
-          Array<ElementA, 1> a;
-          Array<ElementB, 1> b;
-
-          d[0] = d_ref.at(mn);
-          a[0] = a_ref.at(mk);
-          b[0] = b_ref.at(kn);
-
-          mma_op(d, a, b, d);
-
-          d_ref.at(mn) = d[0];
+        for (int n = 0; n < Shape::kN; n+=2) {
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Shape::kM; m+=2) {
+  
+            int m_serpentine = (n % 4) ? (Shape::kM - 2 - m) : m;
+  
+            //top-left element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine, n);
+              MatrixCoord mk(m_serpentine, k);
+              MatrixCoord kn(k, n);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //bottom-left element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine+1, n);
+              MatrixCoord mk(m_serpentine+1, k);
+              MatrixCoord kn(k, n);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //bottom-right element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine+1, n+1);
+              MatrixCoord mk(m_serpentine+1, k);
+              MatrixCoord kn(k, n+1);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+  
+            //top-right element in 2x2 tile
+            {
+              MatrixCoord mn(m_serpentine, n+1);
+              MatrixCoord mk(m_serpentine, k);
+              MatrixCoord kn(k, n+1);
+              Array<ElementC, 1> d;
+              Array<ElementA, 1> a;
+              Array<ElementB, 1> b;
+              d[0] = d_ref.at(mn);
+              a[0] = a_ref.at(mk);
+              b[0] = b_ref.at(kn);
+              mma_op(d, a, b, d);
+              d_ref.at(mn) = d[0];
+            }
+          }
+        }
+      } else 
+      #endif
+      {
+        CUTLASS_PRAGMA_UNROLL
+        for (int n = 0; n < Shape::kN; ++n) {
+  
+          CUTLASS_PRAGMA_UNROLL
+          for (int m = 0; m < Shape::kM; ++m) {
+  
+            int m_serpentine = (n % 2) ? (Shape::kM - 1 - m) : m;
+  
+            MatrixCoord mn(m_serpentine, n);
+            MatrixCoord mk(m_serpentine, k);
+            MatrixCoord kn(k, n);
+  
+            Array<ElementC, 1> d;
+            Array<ElementA, 1> a;
+            Array<ElementB, 1> b;
+  
+            d[0] = d_ref.at(mn);
+            a[0] = a_ref.at(mk);
+            b[0] = b_ref.at(kn);
+  
+            mma_op(d, a, b, d);
+  
+            d_ref.at(mn) = d[0];
+          }
         }
       }
     }
diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h
index 6b604767..dbf23d2c 100644
--- a/include/cutlass/gemm/thread/mma_sm60.h
+++ b/include/cutlass/gemm/thread/mma_sm60.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h
index f3ffb0dc..63094130 100644
--- a/include/cutlass/gemm/thread/mma_sm61.h
+++ b/include/cutlass/gemm/thread/mma_sm61.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h
index 1ea0cfe6..36bb2339 100755
--- a/include/cutlass/gemm/threadblock/default_gemv_core.h
+++ b/include/cutlass/gemm/threadblock/default_gemv_core.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h
index 40b9b34b..d1ffc819 100644
--- a/include/cutlass/gemm/threadblock/default_mma.h
+++ b/include/cutlass/gemm/threadblock/default_mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -90,7 +96,11 @@ template <
     /// when output layout is interleaved.
     bool AccumulatorsInRowMajor = false,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
+    /// Gather operand A by using an index array
+    bool GatherA = false,
+    /// Gather operand B by using an index array
+    bool GatherB = false
     >
 struct DefaultMma;
 
@@ -123,12 +133,17 @@ template <
     /// Instruction-level tile size (concept: GemmShape)
     typename InstructionShape,
     /// Operation performed by GEMM
-    typename Operator
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB
     >
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, LayoutC,
                   arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB> {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
@@ -144,13 +159,13 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorA =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA, GatherA>;
 
   // Define iterators over tiles from the B operand
   using IteratorB =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB, GatherB>;
 
   // Define the threadblock-scoped pipelined matrix multiply
   using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
@@ -186,12 +201,19 @@ template <
     /// Instruction-level tile size (concept: GemmShape)
     typename InstructionShape,
     /// Operation performed by GEMM
-    typename Operator
+    typename Operator,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB
     >
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, layout::RowMajor,
                   arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, 2, Operator, false, SharedMemoryClear,
+                  GatherA, GatherB> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
@@ -202,13 +224,15 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorA =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA,
+          GatherA>;
 
   // Define iterators over tiles from the B operand
   using IteratorB =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB,
+          GatherB>;
 
   // Define the threadblock-scoped pipelined matrix multiply
   using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
@@ -237,12 +261,17 @@ template <
     /// Instruction-level tile size (concept: GemmShape)
     typename InstructionShape,
     /// Operation performed by GEMM
-    typename Operator
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB
     >
 struct DefaultMma<float, LayoutA, kAlignmentA, float, LayoutB,
                   kAlignmentB, float, layout::RowMajor,
                   arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float,
@@ -253,13 +282,13 @@ struct DefaultMma<float, LayoutA, kAlignmentA, float, LayoutB,
   using IteratorA =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kM, MmaCore::Shape::kK>,
-          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>;
+          float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA, GatherA>;
 
   // Define iterators over tiles from the B operand
   using IteratorB =
       cutlass::transform::threadblock::PredicatedTileIterator<
           cutlass::MatrixShape<MmaCore::Shape::kK, MmaCore::Shape::kN>,
-          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>;
+          float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB, GatherB>;
 
   // Define the threadblock-scoped pipelined matrix multiply
   using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined<
@@ -304,7 +333,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator,
                   layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
                   ArchTag, ThreadblockShape, WarpShape, InstructionShape, 2,
-                  Operator, true, SharedMemoryClearOption::kNone> {
+                  Operator, true, SharedMemoryClearOption::kNone, false, false> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
@@ -367,12 +396,17 @@ template <
     /// Number of stages used in the multistage mainloop
     int Stages,
     /// Operation perfomed by GEMM
-    typename Operator
+    typename Operator,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB
     >
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, LayoutC,
                   arch::OpClassSimt, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, Stages, Operator, false, SharedMemoryClearOption::kNone,
+                  GatherA, GatherB> {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
@@ -390,7 +424,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorA =
       cutlass::transform::threadblock::PredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA>;
 
   // Define iterators over tiles from the B operand
   using ThreadMapB = typename MmaCore::IteratorThreadMapB;
@@ -398,7 +432,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorB =
       cutlass::transform::threadblock::PredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB>;
 
   // Define the threadblock-scoped multistage matrix multiply
   using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
@@ -441,12 +475,17 @@ template <
     /// Operation perfomed by GEMM
     typename Operator,
     /// Use zfill or predicate for out-of-bound cp.async
-    SharedMemoryClearOption SharedMemoryClear
+    SharedMemoryClearOption SharedMemoryClear,
+    /// Gather operand A by using an index array
+    bool GatherA,
+    /// Gather operand B by using an index array
+    bool GatherB
     >
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, LayoutC,
                   arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, Stages, Operator, false, SharedMemoryClear> {
+                  InstructionShape, Stages, Operator, false, SharedMemoryClear,
+                  GatherA, GatherB> {
 
   static_assert(platform::is_same<LayoutC, layout::RowMajor>::value
              || platform::is_same<LayoutC, layout::AffineRankN<2>>::value,
@@ -474,7 +513,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorA =
       cutlass::transform::threadblock::PredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
-          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA, GatherA>;
 
   // Define iterators over tiles from the B operand
   using ThreadMapB = typename MmaCore::IteratorThreadMapB;
@@ -482,7 +521,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
   using IteratorB =
       cutlass::transform::threadblock::PredicatedTileAccessIterator<
           cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
-          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB, GatherB>;
 
   // Define the threadblock-scoped multistage matrix multiply
   using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
@@ -530,7 +569,7 @@ struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator,
                   layout::ColumnMajorInterleaved<InterleavedK>, OperatorClass,
                   ArchTag, ThreadblockShape, WarpShape, InstructionShape,
-                  Stages, Operator, true, SharedMemoryClearOption::kNone> {
+                  Stages, Operator, true, SharedMemoryClearOption::kNone, false, false> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
@@ -587,7 +626,7 @@ template <
 struct DefaultMma<int8_t, LayoutA, kAlignmentA, int8_t, LayoutB, kAlignmentB,
                   ElementAccumulator, layout::RowMajor, arch::OpClassSimt,
                   ArchTag, ThreadblockShape, WarpShape, GemmShape<1, 1, 4>, 2,
-                  Operator, false, SharedMemoryClearOption::kNone> {
+                  Operator, false, SharedMemoryClearOption::kNone, false, false> {
   using InstructionShape = GemmShape<1, 1, 4>;
   using ElementA = int8_t;
   using ElementB = int8_t;
@@ -655,7 +694,8 @@ template <
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, LayoutC,
                   arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, 2, Operator, false, SharedMemoryClearOption::kNone,
+                  false, false> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
@@ -714,7 +754,8 @@ template <
 struct DefaultMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB,
                   kAlignmentB, ElementAccumulator, LayoutC,
                   arch::OpClassWmmaTensorOp, ArchTag, ThreadblockShape, WarpShape,
-                  InstructionShape, 1, Operator, false, SharedMemoryClearOption::kNone> {
+                  InstructionShape, 1, Operator, false, SharedMemoryClearOption::kNone,
+                  false, false> {
   // Define the MmaCore components
   using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
       ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h
index 9fb09f73..1b67f345 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h
index f671d310..d850c0f1 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
index e167c820..deb220dc 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
index a8504191..09abd0ec 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
index 221f5a8e..bfa04d44 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
index c68794ec..e5d54d35 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h b/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
new file mode 100644
index 00000000..cf4de84f
--- /dev/null
+++ b/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
@@ -0,0 +1,291 @@
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/gemm/warp/mma.h"
+#include "cutlass/gemm/threadblock/mma_pipelined.h"
+#include "cutlass/gemm/threadblock/mma_singlestage.h"
+#include "cutlass/arch/cache_operation.h" 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Size of a threadblock-scoped access
+    int kAccessSizeInBits = -1, // -1 denoting the default
+    /// Number of stages
+    int Stages = 2,
+    /// Operation performed by MMA
+    typename Operator = typename platform::conditional<
+        (platform::is_same<OperatorClass,
+                           cutlass::arch::OpClassTensorOp>::value) &&
+            (platform::is_same<ElementA, int8_t>::value ||
+             platform::is_same<ElementA, int4b_t>::value ||
+             platform::is_same<ElementA, uint8_t>::value ||
+             platform::is_same<ElementA, uint4b_t>::value),
+        cutlass::arch::OpMultiplyAddSaturate,
+        cutlass::arch::OpMultiplyAdd>::type,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA =
+        cutlass::arch::CacheOperation::Global,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB =
+        cutlass::arch::CacheOperation::Global,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    bool IsComplex = false // (is_complex<ElementA>::value || is_complex<ElementB>::value)
+>
+struct DefaultMmaCoreWithAccessSize;
+
+template <
+    /// Shape of threadblock-scoped matrix multiply operator
+    typename Shape,
+    /// Shape of warp-level matrix multiply operator
+    typename WarpShape,
+    /// Shape of one matrix production operation (concept: GemmShape)
+    typename InstructionShape,
+    /// Element data type of A operand
+    typename ElementA,
+    /// Layout of operand A
+    typename LayoutA,
+    /// Element data type of B operand
+    typename ElementB,
+    /// Layout of operand B
+    typename LayoutB,
+    /// Data type of accumulator
+    typename ElementC,
+    /// Layout of accumulator
+    typename LayoutC,
+    /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp)
+    typename OperatorClass,
+    /// Number of stages
+    int Stages,
+    /// Operation performed by MMA
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor,
+    /// Cache operation of operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Cache operation of operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// per-element transformation for elements of A
+    ComplexTransform TransformA,
+    /// per-element transformation for elements of B
+    ComplexTransform TransformB,
+    bool IsComplex
+>
+struct DefaultMmaCoreWithAccessSize<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, -1, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> : DefaultMmaCore<
+    Shape, WarpShape, InstructionShape,
+    ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC,
+    OperatorClass, Stages, Operator, AccumulatorsInRowMajor,
+    CacheOpA, CacheOpB, TransformA, TransformB, IsComplex
+> {};
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization:
+///
+///   A: column-major
+///   B: row-major
+///   Operator: simt class
+///
+/// This uses the default warp-level operator given tile sizes
+template <
+    /// Shape of threadblock-scoped matrix multiply operator (concept:
+    /// GemmShape)
+    typename Shape_,
+    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
+    typename WarpShape_,
+    /// Data type of A operand
+    typename ElementA_,
+    /// Data type of B operand
+    typename ElementB_,
+    /// Data type of accumulator
+    typename ElementC_,
+    /// Layout of accumulator
+    typename LayoutC_,
+    /// Size of a threadblock-scoped access (a value of -1 indicates the default)
+    int kAccessSizeInBits_,
+    /// Operation performed by GEMM
+    typename Operator_>
+struct DefaultMmaCoreWithAccessSize<Shape_, WarpShape_, typename std::enable_if<kAccessSizeInBits_ != -1, GemmShape<1, 1, 1>>::type, ElementA_,
+                      layout::ColumnMajor, ElementB_, layout::RowMajor,
+                      ElementC_, LayoutC_, arch::OpClassSimt, kAccessSizeInBits_, 2, Operator_
+                     > {
+  using Shape = Shape_;
+  using WarpShape = WarpShape_;
+  using InstructionShape = GemmShape<1, 1, 1>;
+  using ElementA = ElementA_;
+  using LayoutA = layout::ColumnMajor;
+  using ElementB = ElementB_;
+  using LayoutB = layout::RowMajor;
+  using ElementC = ElementC_;
+  using LayoutC = LayoutC_;
+  using OperatorClass = arch::OpClassSimt;
+  static int const PartitionsK = Shape::kK / WarpShape::kK;
+
+  /// Default Operator
+  using Operator = Operator_;
+
+  /// Number of warps present
+  using WarpCount = GemmShape<
+    Shape::kM / WarpShape::kM,
+    Shape::kN / WarpShape::kN,
+    PartitionsK
+  >;
+
+  // Divisility requirements
+  static_assert(
+    !(Shape::kM % WarpShape::kM) &&
+    !(Shape::kN % WarpShape::kN),
+    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
+  );
+
+  /// Number of threads per warp
+  static int const kWarpSize = warp::WarpSize<arch::OpClassSimt>::value;
+
+  /// Number of threads total
+  static int const kThreads = WarpCount::kCount * kWarpSize;
+
+  static int const kElementsPerAccessDefault = 1;
+  static_assert(kAccessSizeInBits_ == -1 ||
+          sizeof_bits<ElementA>::value == sizeof_bits<ElementB>::value ||
+          kAccessSizeInBits_ / sizeof_bits<ElementA>::value == kElementsPerAccessDefault,
+          "Non-default value for kAccessSizeInBits_ is only allowed if size(elementA) == sizeof(elementB)");
+  static int const kElementsPerAccess = (kAccessSizeInBits_ != -1) ? kAccessSizeInBits_ / sizeof_bits<ElementA>::value : kElementsPerAccessDefault;
+
+  //
+  // Shared memory layouts
+  //
+
+  using SmemLayoutA = layout::ColumnMajor;
+  using SmemLayoutB = layout::RowMajor;
+
+  //
+  // Iterators to write to shared memory
+  //
+
+  /// ThreadMap of iterator A
+  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kM, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to A operand
+  using SmemIteratorA = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kM, Shape::kK>, 
+    ElementA, 
+    SmemLayoutA,
+    1,
+    IteratorThreadMapA
+  >;
+
+  /// Policy of iterator B
+  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
+    layout::PitchLinearShape<Shape::kN, Shape::kK>,
+    kThreads,
+    kElementsPerAccess
+  >;
+
+  /// Shared memory iterator to B operand
+  using SmemIteratorB = transform::threadblock::RegularTileIterator<
+    MatrixShape<Shape::kK, Shape::kN>, 
+    ElementB, 
+    SmemLayoutB,
+    0,
+    IteratorThreadMapB
+  >;
+
+  //
+  // Warp-level matrix multiply operator
+  //
+
+  // Define the warp-level op
+  static const int WarpNumThreadsM = detail::simt_get_warp_threads_m<WarpShape>();
+  static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM;
+  static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM;
+  static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN;
+  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
+      "WarpShape must be divisible by ThreadTile shape.");
+  static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1;
+  static const int numElementsA = 128 / sizeof_bits<ElementA>::value;
+  static const int numElementsB = 128 / sizeof_bits<ElementB>::value;
+  static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM);
+  static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN);
+  // these should have max of thread tile also
+  using LaneMmaShape = cutlass::gemm::GemmShape<
+      LaneM,
+      LaneN,
+      1>;
+  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
+      cutlass::MatrixShape<WarpNumThreadsM, WarpNumThreadsN>,   // WarpShape
+      cutlass::layout::RowMajorInterleaved<LaneLayout>,         // LaneLayout
+      LaneMmaShape
+  >;
+
+  using MmaWarpSimt = cutlass::gemm::warp::MmaSimt<
+    WarpShape,    /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8
+    ElementA,     /// Data type of A elements
+    SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
+    ElementB,     /// Data type of B elements
+    SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
+    ElementC,     /// Element type of C matrix
+    LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
+    Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
+    >;            /// Used for partial specialization
+
+  /// Policy used to define MmaPipelined
+  using MmaPolicy = MmaPolicy<
+    MmaWarpSimt,
+    MatrixShape<0, 0>,
+    MatrixShape<0, 0>,
+    WarpCount::kK
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h b/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
index 21778660..57be0c3a 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
index 981f2287..bab044e2 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
index 2afda1fd..e03e9bea 100644
--- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
+++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
index 43c2d9ff..67be56fd 100644
--- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
+++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_mma_with_reduction.h b/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
index e98475c7..65f38f2f 100644
--- a/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
+++ b/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
index f8a3e544..721959dd 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
index 2838eb8b..eda6baf8 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
index 45a068cd..8e2d5ce0 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h b/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
new file mode 100644
index 00000000..4cca4459
--- /dev/null
+++ b/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
@@ -0,0 +1,556 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
+#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA = ComplexTransform::kNone,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB = ComplexTransform::kNone,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator = arch::OpMultiplyAddComplex,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kTriangular,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false>
+struct DefaultMultistageTrmmComplex;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, kDiagType,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, kDiagType, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output and right-side mode
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, kDiagType,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, kDiagType,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output with unit diagonal
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, DiagType::kUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, DiagType::kUnit, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output and right-side mode, unit diagonal
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, DiagType::kUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator> {
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, DiagType::kUnit,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (for TRMM where diagonal imag part is ignored - used by HEMM)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            kSideMode, kFillMode, DiagType::kNonUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements,
+  // when DiagType is kUnit
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          kSideMode, kFillMode, DiagType::kUnit, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          kSideMode, FillMode::kFull, DiagType::kInvalid,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
+      BlasMode::kHermitian>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output and right-side mode (for TRMM where diagonal imag part is ignored - used by HEMM)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename OperatorClass,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Complex transformation on operand A
+    ComplexTransform TransformA,
+    /// Complex transformation on operand B
+    ComplexTransform TransformB,
+    /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex)
+    typename Operator>
+struct DefaultMultistageTrmmComplex<ElementA, LayoutA, ElementB, LayoutB,
+                            SideMode::kRight, kFillMode, DiagType::kNonUnit,
+                            ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, ThreadblockShape, WarpShape,
+                            InstructionShape, Stages, TransformA, TransformB, Operator, BlasMode::kHermitian> {
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, 
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass,
+      Stages, TransformA, TransformB, Operator>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, 
+          SideMode::kRight, FillMode::kFull, DiagType::kInvalid, 
+          AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  // PredicatedTileAccessIteratorTriangularMatrix only tracks diagonal elements,
+  // when DiagType is kUnit
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, 
+          SideMode::kRight, kFillMode, DiagType::kUnit,
+          AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill,
+      BlasMode::kHermitian>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/threadblock/default_sparse_mma.h b/include/cutlass/gemm/threadblock/default_sparse_mma.h
index c04625f0..a1c7dcd1 100644
--- a/include/cutlass/gemm/threadblock/default_sparse_mma.h
+++ b/include/cutlass/gemm/threadblock/default_sparse_mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/default_trmm.h b/include/cutlass/gemm/threadblock/default_trmm.h
new file mode 100644
index 00000000..bfa67ddc
--- /dev/null
+++ b/include/cutlass/gemm/threadblock/default_trmm.h
@@ -0,0 +1,445 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+// 
+/*! \file
+    \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/arch/arch.h"
+#include "cutlass/arch/wmma.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h"
+#include "cutlass/gemm/threadblock/mma_blas3_multistage.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+
+#if defined(CUTLASS_ARCH_WMMA_ENABLED)
+#include "cutlass/gemm/threadblock/default_mma_core_wmma.h"
+#endif //CUTLASS_ARCH_WMMA_ENABLED
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+template <
+    /// Element type for A matrix operand
+    typename ElementA_,
+    /// Layout type for A matrix operand
+    typename LayoutA_,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB_,
+    /// Layout type for B matrix operand
+    typename LayoutB_,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator_,
+    /// Layout type for C and D matrix operands
+    typename LayoutC_,
+    /// Operator class tag
+    typename OperatorClass_,
+    /// Tag indicating architecture to tune for
+    typename ArchTag_,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape_,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape_,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape_,
+    /// Number of stages used in the pipelined mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator,
+    /// Store the accumulators in row major or column major.  Row major is used
+    /// when output layout is interleaved.
+    bool AccumulatorsInRowMajor = false
+    >
+struct DefaultTrmm;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  kSideMode, kFillMode, kDiagType, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, kDiagType, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
+  
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, right side mode (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Diag Type for the triangular matrix
+    DiagType kDiagType,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  SideMode::kRight, kFillMode, kDiagType, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, kDiagType, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output with unit diagonal (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Side Mode for the kernel
+    SideMode kSideMode,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  kSideMode, kFillMode, DiagType::kUnit, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, kSideMode, kFillMode, DiagType::kUnit, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, kSideMode, FillMode::kFull, DiagType::kInvalid, AccessTypeB>;
+  
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization for row-major output, right side mode, unit diagonal (OperatorClass TensorOp)
+template <
+    /// Element type for A matrix operand
+    typename ElementA,
+    /// Layout type for A matrix operand
+    typename LayoutA,
+    /// Access granularity of A matrix in units of elements
+    int kAlignmentA,
+    /// Element type for B matrix operand
+    typename ElementB,
+    /// Layout type for B matrix operand
+    typename LayoutB,
+    /// Access granularity of B matrix in units of elements
+    int kAlignmentB,
+    /// Fill Mode for the triangular matrix
+    FillMode kFillMode,
+    /// Element type for internal accumulation
+    typename ElementAccumulator,
+    /// Tag indicating architecture to tune for
+    typename ArchTag,
+    /// Threadblock-level tile size (concept: GemmShape)
+    typename ThreadblockShape,
+    /// Warp-level tile size (concept: GemmShape)
+    typename WarpShape,
+    /// Instruction-level tile size (concept: GemmShape)
+    typename InstructionShape,
+    /// Number of stages used in the multistage mainloop
+    int Stages,
+    /// Operation perfomed by GEMM
+    typename Operator
+    >
+struct DefaultTrmm<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, 
+                  SideMode::kRight, kFillMode, DiagType::kUnit, 
+                  ElementAccumulator, layout::RowMajor,
+                  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape,
+                  InstructionShape, Stages, Operator, false> {
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      ((sizeof_bits<ElementA>::value * kAlignmentA) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      ((sizeof_bits<ElementB>::value * kAlignmentB) == 128)
+          ? cutlass::arch::CacheOperation::Global
+          : cutlass::arch::CacheOperation::Always;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+      Stages, Operator, false, CacheOpA, CacheOpB>;
+
+  // Define iterators over tiles from the A operand
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::Array<ElementA, kAlignmentA>;
+
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, SideMode::kRight, FillMode::kFull, DiagType::kInvalid, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::Array<ElementB, kAlignmentB>;
+
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIteratorTriangularMatrix<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, SideMode::kRight, kFillMode, DiagType::kUnit, AccessTypeB>;
+
+  // Define the threadblock-scoped multistage matrix multiply
+  using ThreadblockMma = cutlass::gemm::threadblock::MmaBlas3Multistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB,
+      MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor,
+      typename MmaCore::MmaPolicy, Stages, SharedMemoryClearOption::kZfill>;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace gemm
+} // namespace cutlass 
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h
index 38e37d83..74e5e9a8 100755
--- a/include/cutlass/gemm/threadblock/gemv.h
+++ b/include/cutlass/gemm/threadblock/gemv.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h
index d34a6adc..0203c9ce 100644
--- a/include/cutlass/gemm/threadblock/mma_base.h
+++ b/include/cutlass/gemm/threadblock/mma_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_blas3_multistage.h b/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
new file mode 100644
index 00000000..d61442ee
--- /dev/null
+++ b/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
@@ -0,0 +1,706 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Template for a double-buffered threadblock-scoped GEMM kernel.
+    Used by BLAS3 kernels that need to treat diagonal elements of a input iterator as a special case.
+  
+*/
+
+#pragma once
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/arch/memory.h"
+#include "cutlass/array.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/gemm/threadblock/mma_base.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math
+/// instructions.
+template <
+    /// Size of the Gemm problem - concept: gemm::GemmShape<>
+    typename Shape_,
+    /// Iterates over tiles of A operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorA_,
+    /// Iterates over tiles of A operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorA_,
+    /// Cache operation for operand A
+    cutlass::arch::CacheOperation::Kind CacheOpA,
+    /// Iterates over tiles of B operand in global memory
+    //  (concept: ReadableTileIterator | ForwardTileIterator |
+    //  MaskedTileIterator)
+    typename IteratorB_,
+    /// Iterates over tiles of B operand in shared memory
+    /// (concept: WriteableTileIterator | RandomAccessTileIterator)
+    typename SmemIteratorB_,
+    /// Cache operation for operand B
+    cutlass::arch::CacheOperation::Kind CacheOpB,
+    /// Data type of accumulator matrix
+    typename ElementC_,
+    /// Data type of accumulator matrix
+    typename LayoutC_,
+    /// Policy describing tuning details (concept: MmaPolicy)
+    typename Policy_,
+    /// Number of stages,
+    int Stages,
+    /// Use zfill or predicate for out-of-bound cp.async
+    SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kZfill,
+    /// Blas3 computation mode
+    BlasMode BlasMode_ = BlasMode::kTriangular,
+    /// Used for partial specialization
+    typename Enable = bool>
+class MmaBlas3Multistage : 
+  public MmaBase<Shape_, Policy_, Stages> {
+public:
+  ///< Base class
+  using Base = MmaBase<Shape_, Policy_, Stages>;
+  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
+  using Shape = Shape_;
+  ///< Iterates over tiles of A operand in global memory
+  using IteratorA = IteratorA_;
+  ///< Iterates over tiles of B operand in global memory
+  using IteratorB = IteratorB_;
+  ///< Data type of accumulator matrix
+  using ElementC = ElementC_;
+  ///< Layout of accumulator matrix
+  using LayoutC = LayoutC_;
+  ///< Policy describing tuning details
+  using Policy = Policy_;
+  ///< Blas Mode
+  static BlasMode const kBlasMode = BlasMode_;
+
+  using SmemIteratorA = SmemIteratorA_;
+  using SmemIteratorB = SmemIteratorB_;
+
+  static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA;
+  static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB;
+
+  //
+  // Dependent types
+  //
+
+  /// Fragment of accumulator tile
+  using FragmentC = typename Policy::Operator::FragmentC;
+
+  /// Warp-level Mma
+  using Operator = typename Policy::Operator;
+
+  /// Minimum architecture is Sm80 to support cp.async
+  using ArchTag = arch::Sm80;
+  
+  /// Complex transform on A operand
+  static ComplexTransform const kTransformA = Operator::kTransformA;
+
+  /// Complex transform on B operand
+  static ComplexTransform const kTransformB = Operator::kTransformB;
+
+  /// Internal structure exposed for introspection.
+  struct Detail {
+
+    static_assert(Base::kWarpGemmIterations > 1,
+                  "The pipelined structure requires at least two warp-level "
+                  "GEMM operations.");
+
+    /// Number of cp.async instructions to load one stage of operand A
+    static int const AsyncCopyIterationsPerStageA =
+        IteratorA::ThreadMap::Iterations::kCount;
+
+    /// Number of cp.async instructions to load one stage of operand B
+    static int const AsyncCopyIterationsPerStageB =
+        IteratorB::ThreadMap::Iterations::kCount;
+
+    /// Number of stages
+    static int const kStages = Stages;
+
+    /// Number of cp.async instructions to load on group of operand A
+    static int const kAccessesPerGroupA =
+        (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+
+    /// Number of cp.async instructions to load on group of operand B
+    static int const kAccessesPerGroupB =
+        (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations;
+  };
+
+ private:
+
+  using WarpLoadedFragmentA = typename Operator::FragmentA;
+  using WarpLoadedFragmentB = typename Operator::FragmentB;
+  using WarpTransformedFragmentA = typename Operator::TransformedFragmentA;
+  using WarpTransformedFragmentB = typename Operator::TransformedFragmentB;
+
+ private:
+
+  //
+  // Data members
+  //
+
+  /// Iterator to write threadblock-scoped tile of A operand to shared memory
+  SmemIteratorA smem_iterator_A_;
+
+  /// Iterator to write threadblock-scoped tile of B operand to shared memory
+  SmemIteratorB smem_iterator_B_;
+
+public:
+
+  /// Construct from tensor references
+  CUTLASS_DEVICE
+  MmaBlas3Multistage(
+      ///< Shared storage needed for internal use by threadblock-scoped GEMM
+      typename Base::SharedStorage &shared_storage,
+      ///< ID within the threadblock
+      int thread_idx,
+      ///< ID of warp
+      int warp_idx,
+      ///< ID of each thread within a warp
+      int lane_idx
+    ):
+      Base(shared_storage, thread_idx, warp_idx, lane_idx),
+      smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
+      smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx)
+  {
+    // Compute warp location within threadblock tile by mapping the warp_id to
+    // three coordinates:
+    //   _m: the warp's position within the threadblock along the M dimension
+    //   _n: the warp's position within the threadblock along the N dimension
+    //   _k: the warp's position within the threadblock along the K dimension
+
+    int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
+    int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
+
+    int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
+    int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
+
+    // Add per-warp offsets in units of warp-level tiles
+    this->warp_tile_iterator_A_.add_tile_offset(
+        {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
+    this->warp_tile_iterator_B_.add_tile_offset(
+        {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
+  }
+
+  CUTLASS_DEVICE
+  void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B,
+                              int group_start_A = 0, int group_start_B = 0) {
+    iterator_A.set_iteration_index(group_start_A *
+                                   IteratorA::kAccessesPerVector);
+    this->smem_iterator_A_.set_iteration_index(group_start_A);
+
+    // Async Copy for operand A
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) {
+      if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorA::Element>::value *
+                              IteratorA::ThreadMap::kElementsPerAccess /
+                              IteratorA::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_A.get();
+          bool isvalid = iterator_A.valid();
+
+          if (isvalid && iterator_A.getOnDiag()) {
+            // Elements that are on diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              /* The following logic to determine kSizeRealBytes is so that compiler doesn't complain when
+               * compiling for not complex datatype and using half the size for cp_async_zfill */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not of diagonal
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+    }
+
+    iterator_B.set_iteration_index(group_start_B *
+                                   IteratorB::kAccessesPerVector);
+    this->smem_iterator_B_.set_iteration_index(group_start_B);
+
+    // Async Copy for operand B
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) {
+      if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        int const kSrcBytes = sizeof_bits<typename IteratorB::Element>::value *
+                              IteratorB::ThreadMap::kElementsPerAccess /
+                              IteratorB::kAccessesPerVector / 8;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          auto gmem_ptr = iterator_B.get();
+          bool isvalid = iterator_B.valid();
+
+          if (isvalid && iterator_B.getOnDiag()) {
+            // Elements that are on diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not of diagonal
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_B;
+        }
+        ++this->smem_iterator_B_;
+      }
+    }
+  }
+
+  /// Perform a threadblock-scoped matrix multiply-accumulate
+  CUTLASS_DEVICE
+  void operator()(
+      ///< problem size of GEMM
+      int gemm_k_iterations,
+      ///< destination accumulator tile
+      FragmentC &accum,
+      ///< iterator over A operand in global memory
+      IteratorA iterator_A,
+      ///< iterator over B operand in global memory
+      IteratorB iterator_B,
+      ///< initial value of accumulator
+      FragmentC const &src_accum) {
+
+    //
+    // Prologue
+    //
+
+    // Issue several complete stages
+    CUTLASS_PRAGMA_UNROLL
+    for (int stage = 0; stage < Base::kStages - 1;
+         ++stage, --gemm_k_iterations) {
+
+      iterator_A.clear_mask(gemm_k_iterations == 0);
+      iterator_B.clear_mask(gemm_k_iterations == 0);
+
+      iterator_A.set_iteration_index(0);
+      this->smem_iterator_A_.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                this->smem_iterator_A_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorA::Element>::value *
+              IteratorA::ThreadMap::kElementsPerAccess /
+              IteratorA::kAccessesPerVector / 8;
+
+          auto gmem_ptr = iterator_A.get();
+          bool isvalid = iterator_A.valid();
+
+          if (isvalid && iterator_A.getOnDiag()) {
+            // Elements that are on diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorA::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorA::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorA::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorA::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not of diagonal
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpA>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_A;
+        }
+
+        ++this->smem_iterator_A_;
+      }
+
+      iterator_B.set_iteration_index(0);
+      this->smem_iterator_B_.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                this->smem_iterator_B_.get());
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) {
+          int const kSrcBytes =
+              sizeof_bits<typename IteratorB::Element>::value *
+              IteratorB::ThreadMap::kElementsPerAccess /
+              IteratorB::kAccessesPerVector / 8;
+
+          auto gmem_ptr = iterator_B.get();
+          bool isvalid = iterator_B.valid();
+
+          if (isvalid && iterator_B.getOnDiag()) {
+            // Elements that are on diagonal
+            if (kBlasMode == BlasMode::kHermitian && cutlass::is_complex<typename IteratorB::Element>::value) {
+              /* Copy real part from gmem, write zero for imag part in smem */
+              int const kSizeRealBytes = (platform::is_same<typename IteratorB::Element,
+                                          complex<double>>::value) ? 8 : 4;
+              cutlass::arch::cp_async_zfill<kSizeRealBytes, cutlass::arch::CacheOperation::Always>(
+                dst_ptr + v, gmem_ptr, true);
+              cutlass::arch::cp_async_diag<typename IteratorB::Element, true>(
+                reinterpret_cast<char *> (dst_ptr + v) + kSizeRealBytes);
+            } else {
+              /* Write one (1) directly to smem*/
+              cutlass::arch::cp_async_diag<typename IteratorB::Element>(dst_ptr + v);
+            }
+          } else {
+            // Elements that are not of diagonal
+            cutlass::arch::cp_async_zfill<kSrcBytes, kCacheOpB>(
+                dst_ptr + v, gmem_ptr, isvalid);
+          }
+
+          ++iterator_B;
+        }
+
+        ++this->smem_iterator_B_;
+      }
+
+      // Move to the next stage
+      iterator_A.add_tile_offset({0, 1});
+      iterator_B.add_tile_offset({1, 0});
+
+      this->smem_iterator_A_.add_tile_offset({0, 1});
+      this->smem_iterator_B_.add_tile_offset({1, 0});
+
+      // Defines the boundary of a stage of cp.async.
+      cutlass::arch::cp_async_fence();
+    }
+
+    // Perform accumulation in the 'd' output operand
+    accum = src_accum;
+
+    //
+    // Clear the remaining tiles of SMEM. This is a functional requirement for some kernels
+    // so that all accumulator elements outside the GEMM footprint are zero.
+    //
+
+    if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) {
+
+      /// Iterator to write threadblock-scoped tile of A operand to shared memory
+      SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_);
+
+      typename IteratorA::AccessType zero_A;
+      zero_A.clear();
+
+      last_smem_iterator_A.set_iteration_index(0);
+
+      // Async Copy for operand A
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) {
+
+        typename IteratorA::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorA::AccessType *>(
+                last_smem_iterator_A.get());
+
+        *dst_ptr = zero_A;
+
+        ++last_smem_iterator_A;
+      }
+
+      /// Iterator to write threadblock-scoped tile of B operand to shared memory
+      SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_);
+      typename IteratorB::AccessType zero_B;
+
+      zero_B.clear();
+      last_smem_iterator_B.set_iteration_index(0);
+
+      // Async Copy for operand B
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) {
+
+        typename IteratorB::AccessType *dst_ptr =
+            reinterpret_cast<typename IteratorB::AccessType *>(
+                last_smem_iterator_B.get());
+
+        *dst_ptr = zero_B;
+
+        ++last_smem_iterator_B;
+      }
+    }
+
+    // Waits until kStages-2 stages have committed.
+    cutlass::arch::cp_async_wait<Base::kStages - 2>();
+    __syncthreads();
+
+    // Pair of fragments used to overlap shared memory loads and math
+    // instructions
+    WarpLoadedFragmentA warp_loaded_frag_A[2];
+    WarpLoadedFragmentB warp_loaded_frag_B[2];
+    WarpTransformedFragmentA warp_transformed_frag_A[2];
+    WarpTransformedFragmentB warp_transformed_frag_B[2];
+
+    Operator warp_mma;
+
+    this->warp_tile_iterator_A_.set_kgroup_index(0);
+    this->warp_tile_iterator_B_.set_kgroup_index(0);
+
+    this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]);
+    this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]);
+
+    ++this->warp_tile_iterator_A_;
+    ++this->warp_tile_iterator_B_;
+
+    iterator_A.clear_mask(gemm_k_iterations == 0);
+    iterator_B.clear_mask(gemm_k_iterations == 0);
+
+    int smem_write_stage_idx = Base::kStages - 1;
+    int smem_read_stage_idx = 0;
+
+    warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0],
+                       warp_loaded_frag_A[0], warp_loaded_frag_B[0]);
+
+    // tf32x3 kernels use staging accumulation. warp_mma uses a temporary
+    // accumulator and this temporary accumulator is added to the final
+    // accumulator once in every mainloop iteration.
+    plus<FragmentC> plus_accum;
+
+    FragmentC tmp_accum;
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+
+      tmp_accum.clear();
+    }
+
+    //
+    // Mainloop
+    //
+
+    CUTLASS_GEMM_LOOP
+    for (; gemm_k_iterations > (-Base::kStages + 1);) {
+      //
+      // Loop over GEMM K dimension
+      //
+
+      // Computes a warp-level GEMM on data held in shared memory
+      // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate
+      CUTLASS_PRAGMA_UNROLL
+      for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations;
+           ++warp_mma_k) {
+
+        // Load warp-level tiles from shared memory, wrapping to k offset if
+        // this is the last group as the case may be.
+
+        this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
+        
+        this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]);
+        this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+
+        ++this->warp_tile_iterator_A_;
+        ++this->warp_tile_iterator_B_;
+
+        if (warp_mma_k > 0)
+          warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2],
+                             warp_transformed_frag_B[warp_mma_k % 2],
+                             warp_loaded_frag_A[warp_mma_k % 2],
+                             warp_loaded_frag_B[warp_mma_k % 2]);
+
+        if (platform::is_same<typename Operator::MathOperator,
+                              arch::OpMultiplyAddFastF32>::value
+          || platform::is_same<typename Operator::MathOperator,
+                               arch::OpMultiplyAddComplexFastF32>::value) {
+
+          warp_mma(
+            tmp_accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            tmp_accum
+          );
+
+          if (warp_mma_k == 0) {
+            accum = plus_accum(accum, tmp_accum);
+            tmp_accum.clear();
+          }
+        } else {
+          warp_mma(
+            accum, 
+            warp_transformed_frag_A[warp_mma_k % 2],
+            warp_transformed_frag_B[warp_mma_k % 2], 
+            accum
+          );
+        }
+
+        // Issue global->shared copies for the this stage
+        if (warp_mma_k < Base::kWarpGemmIterations - 1) {
+          int group_start_iteration_A, group_start_iteration_B;
+
+          group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA;
+          group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
+                               group_start_iteration_B);
+        }
+
+        if (warp_mma_k + 2 == Base::kWarpGemmIterations) {
+          int group_start_iteration_A, group_start_iteration_B;
+          group_start_iteration_A =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupA;
+          group_start_iteration_B =
+              (warp_mma_k + 1) * Detail::kAccessesPerGroupB;
+
+          copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, 
+                               group_start_iteration_B);
+
+          // Inserts a memory fence between stages of cp.async instructions.
+          cutlass::arch::cp_async_fence();
+
+          // Waits until kStages-2 stages have committed.
+          arch::cp_async_wait<Base::kStages - 2>();
+          __syncthreads();
+
+          // Move to the next stage
+          iterator_A.add_tile_offset({0, 1});
+          iterator_B.add_tile_offset({1, 0});
+
+          this->smem_iterator_A_.add_tile_offset({0, 1});
+          this->smem_iterator_B_.add_tile_offset({1, 0});
+
+          // Add negative offsets to return iterators to the 'start' of the
+          // circular buffer in shared memory
+          if (smem_write_stage_idx == (Base::kStages - 1)) {
+            this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
+            this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
+            smem_write_stage_idx = 0;
+          } else {
+            ++smem_write_stage_idx;
+          }
+
+          if (smem_read_stage_idx == (Base::kStages - 1)) {
+            this->warp_tile_iterator_A_.add_tile_offset(
+                {0, -Base::kStages * Policy::kPartitionsK *
+                        Base::kWarpGemmIterations});
+            this->warp_tile_iterator_B_.add_tile_offset(
+                {-Base::kStages * Policy::kPartitionsK *
+                     Base::kWarpGemmIterations,
+                 0});
+            smem_read_stage_idx = 0;
+          } else {
+            ++smem_read_stage_idx;
+          }
+
+          --gemm_k_iterations;
+          iterator_A.clear_mask(gemm_k_iterations == 0);
+          iterator_B.clear_mask(gemm_k_iterations == 0);
+        }
+
+        // Do any conversions feeding the first stage at the end of the loop so
+        // we can start right away on mma instructions
+        if (warp_mma_k + 1 == Base::kWarpGemmIterations)
+          warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2],
+                             warp_transformed_frag_B[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_A[(warp_mma_k + 1) % 2],
+                             warp_loaded_frag_B[(warp_mma_k + 1) % 2]);
+      }
+
+    }
+
+    if (platform::is_same<typename Operator::MathOperator,
+                          arch::OpMultiplyAddFastF32>::value
+      || platform::is_same<typename Operator::MathOperator,
+                           arch::OpMultiplyAddComplexFastF32>::value) {
+      accum = plus_accum(accum, tmp_accum); 
+    }
+ 
+    if (SharedMemoryClear == SharedMemoryClearOption::kZfill) {
+      // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop
+      cutlass::arch::cp_async_fence();
+      cutlass::arch::cp_async_wait<0>();
+      __syncthreads();
+    }
+
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h
index 436e01cb..a3041ea3 100644
--- a/include/cutlass/gemm/threadblock/mma_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h
index 76513ee3..05ab53e2 100644
--- a/include/cutlass/gemm/threadblock/mma_pipelined.h
+++ b/include/cutlass/gemm/threadblock/mma_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
index cf276f17..77e9e8d3 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
index 47cdf3b1..6b3455dd 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
index d2da8e6d..43d75330 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h
index 57aaf731..c35475a9 100644
--- a/include/cutlass/gemm/threadblock/mma_singlestage.h
+++ b/include/cutlass/gemm/threadblock/mma_singlestage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_sparse_base.h b/include/cutlass/gemm/threadblock/mma_sparse_base.h
index a86cbb6c..91790918 100644
--- a/include/cutlass/gemm/threadblock/mma_sparse_base.h
+++ b/include/cutlass/gemm/threadblock/mma_sparse_base.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
index a92c13ca..a839e218 100644
--- a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h b/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
index 6159db44..2bd7f765 100644
--- a/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
index 70abf988..7f47a08b 100644
--- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h
+++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
index d9f2010e..c57053cc 100644
--- a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
index 6873be99..65e5571c 100644
--- a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h
index b776990c..b0add9d8 100644
--- a/include/cutlass/gemm/warp/default_mma_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
index 479a0046..cf92aa84 100644
--- a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
+++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h b/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
index e94c1ae9..ace2d0f8 100644
--- a/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
index f15e1916..2ce20a47 100644
--- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h
index c61d3022..49e14c64 100644
--- a/include/cutlass/gemm/warp/mma.h
+++ b/include/cutlass/gemm/warp/mma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h
index 2467e8f3..c524e1d5 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
index 666b00d5..99ae1964 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
index 78565cca..db488551 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
index e1c94f7b..4638fecc 100644
--- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
index d1a7922c..6d3b9e1c 100644
--- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_planar_complex.h b/include/cutlass/gemm/warp/mma_planar_complex.h
index 55abfa95..bd43e0b3 100644
--- a/include/cutlass/gemm/warp/mma_planar_complex.h
+++ b/include/cutlass/gemm/warp/mma_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h
index 8f2182fb..8fadccc4 100644
--- a/include/cutlass/gemm/warp/mma_simt.h
+++ b/include/cutlass/gemm/warp/mma_simt.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h
index 365a8ec7..44b1185a 100644
--- a/include/cutlass/gemm/warp/mma_simt_policy.h
+++ b/include/cutlass/gemm/warp/mma_simt_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
index 727b4949..668c309d 100644
--- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
+++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
index ab4a16a9..be5b86e3 100644
--- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h
index 1eb61fef..2c62cd07 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h b/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
index bbaf39a7..30886ca4 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h
index d67141d5..4a4cdf51 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
index 9bf40d75..f847c325 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
index 11c6e1c4..ebce4ed3 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
index d00f2b05..a7f03105 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
index 662cc3c2..4e3e5273 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
index fff4a38a..d3210775 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
index cd837e79..c26d4781 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
index 5a719217..826d591b 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
index b9bcf632..06a7a994 100644
--- a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
index 236d1768..a808fd78 100644
--- a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
+++ b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/half.h b/include/cutlass/half.h
index a182af12..13d7146f 100644
--- a/include/cutlass/half.h
+++ b/include/cutlass/half.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h
index 38ebc816..981c4cbd 100644
--- a/include/cutlass/integer_subbyte.h
+++ b/include/cutlass/integer_subbyte.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h
index cd7fb505..16f906a5 100644
--- a/include/cutlass/kernel_launch.h
+++ b/include/cutlass/kernel_launch.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h
index 6c7b207d..8a676b01 100644
--- a/include/cutlass/layout/layout.h
+++ b/include/cutlass/layout/layout.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h
index d67c58c8..12473f9c 100644
--- a/include/cutlass/layout/matrix.h
+++ b/include/cutlass/layout/matrix.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h
index 781ad68e..ee3f39b4 100644
--- a/include/cutlass/layout/pitch_linear.h
+++ b/include/cutlass/layout/pitch_linear.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h
index a0430815..f31df406 100644
--- a/include/cutlass/layout/tensor.h
+++ b/include/cutlass/layout/tensor.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -80,7 +86,7 @@ class TensorNHWC {
   // Data members
   //
 
-  /// Stride data member - [c, wc, hwc]
+  /// Stride data member - [stride_w, stride_h, stride_n]
   Stride stride_;
 
 public:
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h
index eddd1545..3e962151 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h
index 358084ba..4009ac84 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h
index ec373d00..33602879 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm80.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h
index 2bac7e71..77a3b753 100644
--- a/include/cutlass/layout/vector.h
+++ b/include/cutlass/layout/vector.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/matrix.h b/include/cutlass/matrix.h
index 0c4aa7b6..41b3558f 100644
--- a/include/cutlass/matrix.h
+++ b/include/cutlass/matrix.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h
index 4f7310ec..e6edb382 100644
--- a/include/cutlass/matrix_coord.h
+++ b/include/cutlass/matrix_coord.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h
index 41f65b02..d1086f0f 100644
--- a/include/cutlass/matrix_shape.h
+++ b/include/cutlass/matrix_shape.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h
index 66722c2d..d7aad5ab 100644
--- a/include/cutlass/numeric_conversion.h
+++ b/include/cutlass/numeric_conversion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -34,6 +40,7 @@
 
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
+#include "cutlass/transform/thread/unaryOp.h"
 
 #include "cutlass/array.h"
 #include "cutlass/half.h"
@@ -704,7 +711,8 @@ template <
   typename T,
   typename S,
   int N,
-  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest
+  FloatRoundStyle Round = FloatRoundStyle::round_to_nearest,
+  typename Transform = cutlass::transform::thread::UnaryTransform::Identity
 >
 struct NumericArrayConverter {
 
@@ -712,6 +720,10 @@ struct NumericArrayConverter {
   using source_type = Array<S, N>;
   static FloatRoundStyle const round_style = Round;
 
+  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
+                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
+                  "Unary Operator not supported.");
+
   CUTLASS_HOST_DEVICE
   static result_type convert(source_type const & s) {
 
@@ -720,7 +732,12 @@ struct NumericArrayConverter {
 
     CUTLASS_PRAGMA_UNROLL
     for (int i = 0; i < N; ++i) {
-      result[i] = convert_(s[i]);
+      if( platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value )
+      {
+        result[i] = convert_(s[i]);
+      } else { // conjugate
+        result[i] = conj(convert_(s[i]));
+      }
     }
 
     return result;
@@ -735,17 +752,31 @@ struct NumericArrayConverter {
 template <
   typename T,
   int N,
-  FloatRoundStyle Round
+  FloatRoundStyle Round,
+  typename Transform
 >
-struct NumericArrayConverter<T, T, N, Round> {
+struct NumericArrayConverter<T, T, N, Round, Transform> {
 
   using result_type = Array<T, N>;
   using source_type = Array<T, N>;
   static FloatRoundStyle const round_style = Round;
 
+  static_assert(platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value ||
+                platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Conjugate>::value,
+                  "Unary Operator not supported.");
+
   CUTLASS_HOST_DEVICE
   result_type operator()(source_type const &s) {
-    return s;
+      if( platform::is_same<Transform, cutlass::transform::thread::UnaryTransform::Identity>::value )
+      {
+          return s;
+      } else {
+          result_type result;
+          for (int i = 0; i < N; ++i) {
+              result[i] = conj(s[i]);
+          }
+          return result;
+      }
   }
 };
 
diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h
index 346e847f..c8ec08b7 100644
--- a/include/cutlass/numeric_types.h
+++ b/include/cutlass/numeric_types.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -28,12 +34,6 @@
 */
 #pragma once
 
-#if defined(__CUDACC_RTC__)
-#include <cuda/std/cassert>
-#else
-#include <assert.h>
-#endif
-
 #include "cutlass/cutlass.h"
 
 namespace cutlass {
@@ -63,6 +63,22 @@ struct sizeof_bits<bin1_t> {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <size_t... Seq>
+struct index_sequence;
+
+template <size_t N, size_t... Next>
+struct index_sequence_helper : index_sequence_helper<N - 1, N - 1, Next...> {};
+
+template <size_t... Next>
+struct index_sequence_helper<0, 0, Next...> {
+  using type = index_sequence<0, Next...>;
+};
+
+template <size_t N>
+using make_index_sequence = typename index_sequence_helper<N>::type;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 }  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/pitch_linear_coord.h b/include/cutlass/pitch_linear_coord.h
index 86c60d4e..0830554a 100644
--- a/include/cutlass/pitch_linear_coord.h
+++ b/include/cutlass/pitch_linear_coord.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h
index 86ff82b7..a7719921 100644
--- a/include/cutlass/platform/platform.h
+++ b/include/cutlass/platform/platform.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h
index 34fc8a3a..3ed94389 100644
--- a/include/cutlass/predicate_vector.h
+++ b/include/cutlass/predicate_vector.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/quaternion.h b/include/cutlass/quaternion.h
index d4d875d7..c9af74ca 100644
--- a/include/cutlass/quaternion.h
+++ b/include/cutlass/quaternion.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -599,6 +605,14 @@ Quaternion<Element> operator/(Element s, Quaternion<Element> const &q) {
   return s * reciprocal(q);
 }
 
+/// Comparison 
+template <typename Element>
+CUTLASS_HOST_DEVICE
+bool operator<(Quaternion<Element> const &lhs, Quaternion<Element> const &rhs) {
+  //TODO
+  return true; 
+}
+
 /// Rotates a 3-vector assuming this is a unit quaternion (a spinor). This avoids computing
 /// a reciprocal.
 template <typename Element>
diff --git a/include/cutlass/real.h b/include/cutlass/real.h
index ba309b1d..72059a2f 100644
--- a/include/cutlass/real.h
+++ b/include/cutlass/real.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h
index cb65113f..972708e6 100644
--- a/include/cutlass/reduction/device/reduce_split_k.h
+++ b/include/cutlass/reduction/device/reduce_split_k.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/device/tensor_reduce.h b/include/cutlass/reduction/device/tensor_reduce.h
index d9d12777..7b5e436a 100644
--- a/include/cutlass/reduction/device/tensor_reduce.h
+++ b/include/cutlass/reduction/device/tensor_reduce.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h b/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
index dd4740de..eae99b38 100644
--- a/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
+++ b/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/device/tensor_reduce_affine_strided.h b/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
index 6c127467..42302f2b 100644
--- a/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
+++ b/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h
index 24599d8e..359b38a3 100644
--- a/include/cutlass/reduction/kernel/reduce_split_k.h
+++ b/include/cutlass/reduction/kernel/reduce_split_k.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
index 490f8240..92371753 100644
--- a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
+++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
index 92f3e388..3429fb58 100644
--- a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
+++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h
index ff105841..e6574d7a 100644
--- a/include/cutlass/reduction/thread/reduce.h
+++ b/include/cutlass/reduction/thread/reduce.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h
index f160c502..f9d4059b 100644
--- a/include/cutlass/reduction/thread/reduction_operators.h
+++ b/include/cutlass/reduction/thread/reduction_operators.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h
index acdcea02..b97dd372 100644
--- a/include/cutlass/reduction/threadblock_swizzle.h
+++ b/include/cutlass/reduction/threadblock_swizzle.h
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 /*! \file
diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h
index e6b6d7c1..c3964263 100644
--- a/include/cutlass/relatively_equal.h
+++ b/include/cutlass/relatively_equal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h
index c701079a..5765f48f 100644
--- a/include/cutlass/semaphore.h
+++ b/include/cutlass/semaphore.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h
index 04a9a567..d75f4b00 100644
--- a/include/cutlass/subbyte_reference.h
+++ b/include/cutlass/subbyte_reference.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h
index 13c94c59..4cb1e7be 100644
--- a/include/cutlass/tensor_coord.h
+++ b/include/cutlass/tensor_coord.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h
index a7792d3f..8236fdc5 100644
--- a/include/cutlass/tensor_ref.h
+++ b/include/cutlass/tensor_ref.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -222,7 +228,7 @@ class TensorRef {
   template<typename _Magic = int>
   CUTLASS_HOST_DEVICE
   TensorRef(
-    NonConstTensorRef const &ref,             ///< TensorRef to non-const data
+    NonConstTensorRef const &ref,              ///< TensorRef to non-const data
     ///SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const
     _Magic magic = (typename platform::enable_if< ! platform::is_same<NonConstTensorRef, TensorRef<Element_, Layout_> >::value, _Magic>::type)0
   ):
diff --git a/include/cutlass/tensor_ref_planar_complex.h b/include/cutlass/tensor_ref_planar_complex.h
index 31eef43e..6aa5cd42 100644
--- a/include/cutlass/tensor_ref_planar_complex.h
+++ b/include/cutlass/tensor_ref_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h
index f17b6645..e1cbce78 100644
--- a/include/cutlass/tensor_view.h
+++ b/include/cutlass/tensor_view.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/tensor_view_planar_complex.h b/include/cutlass/tensor_view_planar_complex.h
index e4018267..7ec2b9be 100644
--- a/include/cutlass/tensor_view_planar_complex.h
+++ b/include/cutlass/tensor_view_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h
index 3ddf61ff..b522b600 100644
--- a/include/cutlass/tfloat32.h
+++ b/include/cutlass/tfloat32.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h
index ec3ba8b5..238f0955 100644
--- a/include/cutlass/thread/matrix.h
+++ b/include/cutlass/thread/matrix.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/trace.h b/include/cutlass/trace.h
index 133485b1..fb32cb81 100644
--- a/include/cutlass/trace.h
+++ b/include/cutlass/trace.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h
index 3a97ac22..803df22a 100644
--- a/include/cutlass/transform/pitch_linear_thread_map.h
+++ b/include/cutlass/transform/pitch_linear_thread_map.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/thread/transpose.h b/include/cutlass/transform/thread/transpose.h
index 5bf68b92..8e529c41 100644
--- a/include/cutlass/transform/thread/transpose.h
+++ b/include/cutlass/transform/thread/transpose.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/thread/unaryOp.h b/include/cutlass/transform/thread/unaryOp.h
index 813fce49..77645da9 100644
--- a/include/cutlass/transform/thread/unaryOp.h
+++ b/include/cutlass/transform/thread/unaryOp.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
index 3d9b5fb4..d45c4441 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.                                 
- *                                                                                                    
- * Redistribution and use in source and binary forms, with or without modification, are permitted     
- * provided that the following conditions are met:                                                    
- *     * Redistributions of source code must retain the above copyright notice, this list of          
- *       conditions and the following disclaimer.                                                     
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of       
- *       conditions and the following disclaimer in the documentation and/or other materials          
- *       provided with the distribution.                                                              
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used     
- *       to endorse or promote products derived from this software without specific prior written     
- *       permission.                                                                                  
- *                                                                                                    
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR     
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND   
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE    
- * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,      
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;    
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,      
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE     
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  *
  **************************************************************************************************/
@@ -283,7 +289,7 @@ class PredicatedTileAccessIteratorPredicates {
 
   /// Returns whether access is valid or not
   CUTLASS_HOST_DEVICE
-  bool valid() {
+  bool valid() const {
 
     
     int pred_idx = 
@@ -305,7 +311,7 @@ class PredicatedTileAccessIteratorPredicates {
 /// PredicatedTileAccessIterator
 ///
 template <typename Shape, typename Element, typename Layout, int AdvanceRank,
-          typename ThreadMap, typename AccessType>
+          typename ThreadMap, typename AccessType, bool Gather = false>
 class PredicatedTileAccessIterator;
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -313,9 +319,9 @@ class PredicatedTileAccessIterator;
 /// Specialization of PredicatedTileAccessIterator for pitch-linear data.
 ///
 template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
+          typename ThreadMap_, typename AccessType_, bool Gather>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -390,6 +396,17 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
   /// Used for out-of-order visitation
   bool is_residue_tile_;
 
+  /// Below is used when Gather is turned on.  We need to record strided_offset
+  /// and contiguous_offset seperated to compute the offset by using
+  ///
+  /// offset = contiguous_offset + indices[strided_offset]
+  ///
+
+  /// Gather indices
+  int const *indices_;
+
+  Index gather_offset_strided;
+
  private:
   /// Computes predicates based on internally tracked per-thread offset.
   CUTLASS_DEVICE
@@ -416,19 +433,27 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
       : params_(params),
 	pointer_(reinterpret_cast<BytePointer>(
             const_cast<NonConstPointer>(pointer))),
 	the_predicates(extent),
-        is_residue_tile_(true) {
+        is_residue_tile_(true),
+        indices_(indices) {
 
     the_predicates.set_predicates(thread_id, threadblock_offset);
           
     // update internal pointers
     Layout layout(params_.stride_);
-    add_pointer_offset(layout(the_predicates.thread_offset_));
 
+    if (!Gather) {
+      add_pointer_offset(layout(the_predicates.thread_offset_));
+    } else {
+      gather_offset_strided = the_predicates.thread_offset_.strided();
+      add_pointer_offset(layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0)));
+    }
   }
 
   /// Construct a PredicatedTileAccessIterator with zero threadblock offset
@@ -465,33 +490,68 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
 
       the_predicates.thread_offset_ += the_predicates.residue_offset_;
 
+      the_predicates.compute_predicates_(the_predicates.extent_, true);
+
       Layout layout(params_.stride_);
-      add_pointer_offset(layout(the_predicates.residue_offset_));
 
-      the_predicates.compute_predicates_(the_predicates.extent_, true);
+      if (!Gather) {
+        add_pointer_offset(layout(the_predicates.residue_offset_));
 
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided() - 1);
+          pointer_ += Shape::kContiguous * tile_offset.contiguous();
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
+          pointer_ += Shape::kStrided * tile_offset.strided();
+        }
       } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous() - 1);
-        pointer_ += Shape::kStrided * tile_offset.strided();
+        gather_offset_strided = the_predicates.thread_offset_.strided();
+        add_pointer_offset(layout(make_Coord(the_predicates.residue_offset_.contiguous(), 0)));
+
+        if (kAdvanceRank) {
+          gather_offset_strided += Shape::kStrided * (tile_offset.strided() - 1);
+          add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+        } else {
+          add_pointer_offset(Shape::kContiguous * (tile_offset.contiguous() - 1));
+          gather_offset_strided += Shape::kStrided * tile_offset.strided();
+        }
       }
     } else {
-      if (kAdvanceRank) {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
-        pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      if (!Gather) {
+        if (kAdvanceRank) {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+          pointer_ += Shape::kContiguous * tile_offset.contiguous();
+        } else {
+          pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+          pointer_ += Shape::kStrided * tile_offset.strided();
+        }
       } else {
-        pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
-        pointer_ += Shape::kStrided * tile_offset.strided();
+        add_pointer_offset(Shape::kContiguous * tile_offset.contiguous());
+        gather_offset_strided += Shape::kStrided * tile_offset.strided();
       }
     }
+
     is_residue_tile_ = false;
   }
 
   /// Returns a pointer
   CUTLASS_HOST_DEVICE
   AccessType *get() const {
+    if (Gather) {
+      assert(indices_);
+
+      if (!valid()) {
+        return nullptr;
+      }
+
+      LongIndex contiguous_offset = the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value / 8) + the_predicates.iteration_vector_;
+      int strided_index = gather_offset_strided + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided;
+
+      LongIndex strided_offset = indices_[strided_index] * LongIndex(params_.stride_) * sizeof_bits<Element>::value / 8;
+
+      return reinterpret_cast<AccessType *>(pointer_ + contiguous_offset + strided_offset);
+    }
+
     return reinterpret_cast<AccessType *>(
         pointer_ + 
         the_predicates.iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + the_predicates.iteration_vector_;
@@ -521,7 +581,10 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
     ++the_predicates.iteration_strided_;
 
     if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) {
-      pointer_ += params_.inc_strided_;
+      if (!Gather) {
+        pointer_ += params_.inc_strided_;
+      }
+
       return *this;
     }
 
@@ -529,13 +592,15 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
     // which means we enter the next tile.
     the_predicates.iteration_strided_ = 0;
 
-    // advance to next tile
-    pointer_ += params_.inc_next_;
-
-    // now return to start tile - if the iterator is subsequently advanced, this
-    // subtraction as well as the subsequent integer addition are both elided by
-    // the compiler.
-    pointer_ -= params_.inc_advance_;
+    if (!Gather) {
+      // advance to next tile
+      pointer_ += params_.inc_next_;
+  
+      // now return to start tile - if the iterator is subsequently advanced, this
+      // subtraction as well as the subsequent integer addition are both elided by
+      // the compiler.
+      pointer_ -= params_.inc_advance_;
+    }
 
     return *this;
   }
@@ -574,10 +639,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
 
   /// Returns whether access is valid or not
   CUTLASS_HOST_DEVICE
-  bool valid() {
+  bool valid() const {
     return the_predicates.valid();
   }
-
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -590,9 +654,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::PitchLinear,
 ///            MaskedTileIteratorConcept
 ///
 template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
+          typename ThreadMap_, typename AccessType_, bool Gather>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -618,7 +682,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
 
   using UnderlyingIterator = PredicatedTileAccessIterator<
       layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType>;
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, AccessType, Gather>;
 
   /// Predicate vector stores mask to guard accesses
   using Mask = typename UnderlyingIterator::Mask;
@@ -672,12 +736,15 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
       ///< ID of each participating thread
       int thread_id,
       ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.row(), extent.column()),
                   thread_id,
                   layout::PitchLinearCoord(threadblock_offset.row(),
-                                           threadblock_offset.column())) {}
+                                           threadblock_offset.column()),
+                  indices) {}
 
   /// Construct a PredicatedTileAccessIterator with zero threadblock offset
   CUTLASS_HOST_DEVICE
@@ -771,9 +838,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::ColumnMajor,
 ///            MaskedTileIteratorConcept
 ///
 template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, typename AccessType_>
+          typename ThreadMap_, typename AccessType_, bool Gather>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, Gather> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -799,7 +866,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
 
   using UnderlyingIterator = PredicatedTileAccessIterator<
       layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
-      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType>;
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, AccessType, Gather>;
 
   static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
 
@@ -853,12 +920,15 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
       ///< ID of each participating thread
       int thread_id,
       ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.column(), extent.row()),
                   thread_id,
                   layout::PitchLinearCoord(threadblock_offset.column(),
-                                           threadblock_offset.row())) {}
+                                           threadblock_offset.row()),
+                  indices) {}
 
   /// Construct a PredicatedTileAccessIterator with zero threadblock offset
   CUTLASS_HOST_DEVICE
@@ -954,7 +1024,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::RowMajor,
 template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, typename AccessType_>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1087,7 +1157,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
       ///< ID of each participating thread
       int thread_id,
       ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : params_(params),
         pointer_(reinterpret_cast<BytePointer>(
             const_cast<NonConstPointer>(pointer))),
@@ -1099,7 +1171,6 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
     // update internal pointers
     Layout layout(params_.stride_);
     add_pointer_offset(layout(the_predicates.thread_offset_));
-
   }
 
   /// Construct a PredicatedTileAccessIterator with zero threadblock offset
@@ -1256,7 +1327,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRankN<2>,
 template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, typename AccessType_>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1332,7 +1403,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMa
       ///< ID of each participating thread
       int thread_id,
       ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.row(), extent.column()),
                   thread_id,
@@ -1433,7 +1506,7 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2ColumnMa
 template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, typename AccessType_>
 class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1509,7 +1582,9 @@ class PredicatedTileAccessIterator<Shape_, Element_, layout::AffineRank2RowMajor
       ///< ID of each participating thread
       int thread_id,
       ///< Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.column(), extent.row()),
                   thread_id,
@@ -1613,7 +1688,7 @@ template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, typename AccessType_, int InterleavedK>
 class PredicatedTileAccessIterator<Shape_, Element_,
                                    layout::ColumnMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1693,7 +1768,9 @@ class PredicatedTileAccessIterator<Shape_, Element_,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.row() * kInterleavedK,
                                            extent.column() / kInterleavedK),
@@ -1796,7 +1873,7 @@ template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, typename AccessType_, int InterleavedK>
 class PredicatedTileAccessIterator<Shape_, Element_,
                                    layout::RowMajorInterleaved<InterleavedK>,
-                                   AdvanceRank, ThreadMap_, AccessType_> {
+                                   AdvanceRank, ThreadMap_, AccessType_, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1877,7 +1954,9 @@ class PredicatedTileAccessIterator<Shape_, Element_,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.column() * kInterleavedK,
                                            extent.row() / kInterleavedK),
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
index aa97ece0..2ee60833 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.                                 
- *                                                                                                    
- * Redistribution and use in source and binary forms, with or without modification, are permitted     
- * provided that the following conditions are met:                                                    
- *     * Redistributions of source code must retain the above copyright notice, this list of          
- *       conditions and the following disclaimer.                                                     
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of       
- *       conditions and the following disclaimer in the documentation and/or other materials          
- *       provided with the distribution.                                                              
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used     
- *       to endorse or promote products derived from this software without specific prior written     
- *       permission.                                                                                  
- *                                                                                                    
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR     
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND   
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE    
- * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,      
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;    
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,      
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE     
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
index 4ebd35c5..251c36e2 100755
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -67,7 +73,17 @@ struct PredicatedTileAccessIteratorDesc {
     advance_rank(advance_rank_),
     threadblock_shape(threadblock_shape_),
     threadmap_iterations(threadmap_iterations_),
-    threadmap_delta(threadmap_delta_) { }
+    threadmap_delta(threadmap_delta_)
+  {
+    #if 0
+    printf("PredicatedTileAccessIteratorDesc(%d, %d, {%d, %d}, {%d, %d}, {%d, %d}})\n",
+      element_size_bits,
+      advance_rank,
+      threadblock_shape.contiguous(), threadblock_shape.strided(),
+      threadmap_iterations.contiguous(), threadmap_iterations.strided(),
+      threadmap_delta.contiguous(), threadmap_delta.strided());
+    #endif
+  }
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -118,8 +134,7 @@ struct MakePredicatedTileAccessIteratorDesc <
   PredicatedTileAccessIteratorDesc operator()() {
 
     return UnderlyingMakeOperator()();
-}
-
+  }
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -141,9 +156,9 @@ struct MakePredicatedTileAccessIteratorDesc <
   PredicatedTileAccessIteratorDesc operator()() {
 
     return UnderlyingMakeOperator()();
-}
-
+  }
 };
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Specialization of PredicatedTileAccessIterator for column-major interleaved data.
@@ -164,9 +179,9 @@ struct MakePredicatedTileAccessIteratorDesc <
   PredicatedTileAccessIteratorDesc operator()() {
 
     return UnderlyingMakeOperator()();
-}
-
+  }
 };
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Specialization of PredicatedTileAccessIterator for roww-major interleaved data.
@@ -187,9 +202,9 @@ struct MakePredicatedTileAccessIteratorDesc <
   PredicatedTileAccessIteratorDesc operator()() {
 
     return UnderlyingMakeOperator()();
-}
-
+  }
 };
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 //
@@ -242,7 +257,6 @@ struct PredicatedTileAccessIteratorParams {
                                    desc.element_size_bits / 8;    
 
     return Status::kSuccess;
-
   }
 
   CUTLASS_HOST_DEVICE
@@ -266,7 +280,6 @@ struct PredicatedTileAccessIteratorParams {
   }
 };
 
-
 ////////////////////////////////////////////////////////////////////////////////
 
 }  // namespace threadblock
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
new file mode 100644
index 00000000..dc0d89d3
--- /dev/null
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
@@ -0,0 +1,892 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates calculating the address and predicates to the load of tiles
+   from pitch-linear rank=2 tensors.
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last
+   "residue" tile first, with the objective of minimizing predicate mask updates
+   during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be
+   stored in registers, and integer addition is used to advance the pointer
+   through memory.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileAccessIteratorTriangularMatrix
+///
+template <typename Shape, typename Element, typename Layout, 
+          int AdvanceRank, typename ThreadMap, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          typename AccessType>
+class PredicatedTileAccessIteratorTriangularMatrix;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for pitch-linear data.
+///
+template <typename Shape_, typename Element_, int AdvanceRank,
+          typename ThreadMap_, SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  using StrideIndex = typename Layout::Stride::Index;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  using CompareOp = typename TrMatrixCompareOp<kFillMode, kDiagType>::Type;
+
+  static_assert( kFillMode == FillMode::kFull || 
+                 ((kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) && AccessType::kElements == 1), 
+                 "BLAS3 iterator for the triangular/symmetric matrix must use AccessType::kElements as 1");
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), 
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  static int const kPredicatesPerByte = 4;
+  static int const kPredicatesPerWord = 4 * kPredicatesPerByte;
+
+  static int const kPredicateCount = ThreadMap::Iterations::kCount * kAccessesPerVector;
+
+  /// Number of 32b words containing predicates
+  static int const kPredicateByteCount = 
+    (kPredicateCount + kPredicatesPerByte - 1) / kPredicatesPerByte;
+  static int const kPredicateWordCount = (kPredicateByteCount + 3) / 4;
+
+  static unsigned const kPredicateMask = (1u << kPredicatesPerByte) - 1u;
+
+  static_assert(kPredicateWordCount <= 4, "Too many predicates.");
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = Array<uint32_t, kPredicateWordCount>;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+   private:
+    /// stride of pitch-linear layout (units of Element)
+    StrideIndex stride_;
+    /// (true)  pitch-linear layout is mapped to row-major matrix 
+    /// (false) pitch-linear layout is mapped to column-major matrix
+    bool is_row_major_;
+    /// for vectorized access across the diagonal boundary guard condition is
+    /// checked for the element on the boundary
+    int access_diagonal_boundary_;    
+    /// amount (in byte) to increment pointer to move to next access along
+    /// strided dimension
+    LongIndex inc_strided_;
+    /// amount (in byte) to increment pointer from last access to first access
+    /// of next tile
+    LongIndex inc_next_;
+    /// amount (in byte) to increment pointer from first access of current tile
+    /// to first access of next tile
+    LongIndex inc_advance_;
+
+   public:
+
+    // Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): stride_(0), inc_strided_(0), inc_next_(0), inc_advance_(0), is_row_major_(false), access_diagonal_boundary_(0) { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout, bool is_row_major, int access_diagonal_boundary) : 
+      stride_(layout.stride(0)), is_row_major_(is_row_major), access_diagonal_boundary_(access_diagonal_boundary) {
+
+      inc_strided_ = (LongIndex(stride_) * ThreadMap::Delta::kStrided) *
+                     sizeof_bits<Element>::value / 8;
+
+      if (kAdvanceRank) {
+        // advance along strided dimension
+        inc_advance_ =
+            Shape::kStrided * LongIndex(stride_) * sizeof_bits<Element>::value / 8;
+      } else {
+        // advance along contiguous dimension
+        inc_advance_ = Shape::kContiguous * sizeof_bits<Element>::value / 8;
+      }
+
+      inc_next_ = inc_advance_ - LongIndex(ThreadMap::Iterations::kStrided - 1) *
+                                     ThreadMap::Delta::kStrided * LongIndex(stride_) *
+                                     sizeof_bits<Element>::value / 8;
+
+    };
+
+
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Parameters object with precomputed internal state
+  Params const &params_;
+
+  /// Internal pointer to first access of tile
+  BytePointer pointer_;
+
+  /// Guard predicates
+  uint32_t predicates_[kPredicateWordCount];
+
+  /// Track global memory addresses on the diagonal 
+  /// To ignore imag part for diagonal elements of hermitian matrices
+  uint32_t predicates_onDiag_[kPredicateWordCount];
+
+  /// Size of tensor
+  TensorCoord extent_;
+
+  /// Initial offset for each thread
+  TensorCoord thread_offset_;
+
+  /// Iteration along vectors implied by the thread map
+  int iteration_vector_;
+
+  /// Iteration in the contiguous dimension
+  int iteration_contiguous_;
+
+  /// Iteration in the strided dimension
+  int iteration_strided_;
+
+ private:
+  /// Computes predicates based on internally tracked per-thread offset.
+  CUTLASS_DEVICE
+  void compute_predicates_(
+      /// Extent of the matrix window
+      TensorCoord extent) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0u;
+      predicates_onDiag_[i] = 0u;
+    }
+
+    CompareOp compare_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int access_idx = 0; access_idx < ThreadMap::Iterations::kCount * kAccessesPerVector; ++access_idx) {
+
+      int s = access_idx / (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+      
+      int access_residual = access_idx % (ThreadMap::Iterations::kContiguous * kAccessesPerVector);
+
+      int c = access_residual / kAccessesPerVector;
+      int v = access_residual % kAccessesPerVector;
+
+      TensorCoord iteration_coord(c * ThreadMap::Delta::kContiguous + v * AccessType::kElements,
+                                s * ThreadMap::Delta::kStrided);
+
+      TensorCoord coord = thread_offset_ + iteration_coord;
+
+      bool guard;
+      bool onDiag = false;
+
+      guard = ((coord.strided() < extent.strided()) && 
+                (coord.contiguous() < extent.contiguous()));
+    
+
+      // guard access on the wrong side of the triagular matrix diagonal
+      if (kFillMode == FillMode::kLower || kFillMode == FillMode::kUpper) {
+        coord += TensorCoord{params_.access_diagonal_boundary_, 0};
+
+        bool triagular_guard_row_major = compare_op(coord.strided(), coord.contiguous()) | !params_.is_row_major_;
+        bool triagular_guard_col_major = compare_op(coord.contiguous(), coord.strided()) | params_.is_row_major_;
+        
+        guard = guard && triagular_guard_row_major && triagular_guard_col_major;
+
+        if (kDiagType == DiagType::kUnit) {
+          onDiag = (guard && coord.strided() == coord.contiguous()) ? true : false;
+        }
+      }
+
+      int pred_idx_onDiag = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+      int word_idx_onDiag = pred_idx_onDiag / kPredicatesPerWord;
+      int residual_onDiag = pred_idx_onDiag % kPredicatesPerWord;
+      int byte_idx_onDiag = residual_onDiag / kPredicatesPerByte;
+      int bit_idx_onDiag = residual_onDiag % kPredicatesPerByte;
+      
+      predicates_onDiag_[word_idx_onDiag] |= (unsigned(onDiag) << (byte_idx_onDiag * 8 + bit_idx_onDiag));
+
+      int pred_idx = v + kAccessesPerVector * (c + ThreadMap::Iterations::kContiguous * s);
+
+      int word_idx = pred_idx / kPredicatesPerWord;
+      int residual = pred_idx % kPredicatesPerWord;
+      int byte_idx = residual / kPredicatesPerByte;
+      int bit_idx = residual % kPredicatesPerByte;
+      
+      predicates_[word_idx] |= (unsigned(guard) << (byte_idx * 8 + bit_idx));
+
+    }
+
+  }
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : params_(params),
+        pointer_(reinterpret_cast<BytePointer>(const_cast<NonConstPointer>(pointer))),
+        extent_(extent) {
+
+
+    // Per-thread offset in logical coordinates of tensor
+    thread_offset_ = threadblock_offset + ThreadMap::initial_offset(thread_id);
+
+    // update internal pointers
+    Layout layout(params_.stride_);
+    add_pointer_offset(layout(thread_offset_));
+
+    compute_predicates_(extent_);
+
+    set_iteration_index(0);
+  }
+
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id)
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) {
+
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += sizeof_bits<Element>::value * pointer_offset / 8;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+
+    if (kAdvanceRank) {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided());
+      pointer_ += Shape::kContiguous * tile_offset.contiguous();
+      thread_offset_ += TensorCoord{0, Shape::kStrided * tile_offset.strided()};
+    } else {
+      pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous());
+      pointer_ += Shape::kStrided * tile_offset.strided();
+      thread_offset_ += TensorCoord{Shape::kContiguous * tile_offset.contiguous(), 0};
+    }
+
+    compute_predicates_(extent_);
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(
+        pointer_ + 
+        iteration_contiguous_ * (ThreadMap::Delta::kContiguous * sizeof_bits<Element>::value) / 8) + iteration_vector_;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+
+    iteration_vector_ = 0;
+    ++iteration_contiguous_;
+
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+
+    // Enter here only if (iteration_contiguous_ ==
+    // ThreadMap::Iteration::kContiguous)
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      pointer_ += params_.inc_strided_;
+      return *this;
+    }
+
+    // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided)
+    // which means we enter the next tile.
+    iteration_strided_ = 0;
+
+    // advance to next tile
+    pointer_ += params_.inc_next_;
+
+    // now return to start tile - if the iterator is subsequently advanced, this
+    // subtraction as well as the subsequent integer addition are both elided by
+    // the compiler.
+    pointer_ -= params_.inc_advance_;
+
+    return *this;
+  }
+
+  /// Increment and return an instance to self.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = enable ? 0u : predicates_[i];
+    }
+
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = 0xffffffff;
+    }
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { 
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      predicates_[i] = mask[i];
+    }
+
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+     CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kPredicateWordCount; ++i) {
+      mask[i] = predicates_[i];
+    }
+  }
+
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    int pred_idx = 
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    
+    bool pred = (predicates_onDiag_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+
+    
+    int pred_idx = 
+      iteration_vector_ + kAccessesPerVector * (iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous);
+
+    int word_idx = pred_idx / kPredicatesPerWord;
+    int residual = pred_idx % kPredicatesPerWord;
+    int byte_idx = residual / kPredicatesPerByte;
+    int bit_idx = residual % kPredicatesPerByte;
+    
+    bool pred = (predicates_[word_idx] & (1u << (byte_idx * 8 + bit_idx))) != 0;
+    return pred;
+    
+
+    //return true;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+            SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+            typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor,
+                                   AdvanceRank, ThreadMap_, kSideMode, kFillMode, kDiagType, 
+                                   AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kRow, Shape::kColumn>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 0 : 1), ThreadMap, 
+      kSideMode, kFillMode, kDiagType, AccessType>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  static int const kAccessDiagonalBoundary = 
+    (kFillMode == FillMode::kLower) ? (AccessType::kElements - 1) : 0;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), false, kAccessDiagonalBoundary){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.row(), extent.column()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.row(),
+                                           threadblock_offset.column())) {}
+
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileAccessIteratorTriangularMatrix for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept |
+///            ReadableContiguousTileIteratorConcept |
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          typename AccessType_>
+class PredicatedTileAccessIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
+                                                  kSideMode, kFillMode, kDiagType, AccessType_> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileAccessIteratorTriangularMatrix<
+      layout::PitchLinearShape<Shape::kColumn, Shape::kRow>, Element,
+      layout::PitchLinear, (kAdvanceRank == 0 ? 1 : 0), ThreadMap, 
+      kSideMode, kFillMode, kDiagType, AccessType>;
+
+  static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector;
+
+  static int const kAccessDiagonalBoundary = 
+    (kFillMode == FillMode::kUpper) ? (AccessType::kElements - 1) : 0;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   private:
+    friend PredicatedTileAccessIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+   public:
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout)
+        : params_(layout::PitchLinear(layout.stride(0)), true, kAccessDiagonalBoundary){};
+  };
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      ///< Precomputed parameters object
+      Params const &params,
+      ///< Pointer to start of tensor
+      Pointer pointer,
+      ///< Extent of tensor
+      TensorCoord extent,
+      ///< ID of each participating thread
+      int thread_id,
+      ///< Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : iterator_(params.params_, pointer,
+                  layout::PitchLinearCoord(extent.column(), extent.row()),
+                  thread_id,
+                  layout::PitchLinearCoord(threadblock_offset.column(),
+                                           threadblock_offset.row())) {}
+
+  /// Construct a PredicatedTileAccessIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileAccessIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                                     make_Coord(0, 0)) {}
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(int index) { iterator_.set_iteration_index(index); }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole
+  /// tiles
+  CUTLASS_HOST_DEVICE
+  void add_tile_offset(TensorCoord const &tile_offset) {
+    iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()});
+  }
+
+  /// Returns a pointer
+  CUTLASS_HOST_DEVICE
+  AccessType *get() const {
+    return reinterpret_cast<AccessType *>(iterator_.get());
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileAccessIteratorTriangularMatrix operator++(int) {
+    PredicatedTileAccessIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { iterator_.get_mask(mask); }
+
+  /// Return if the address in on the diagonal
+  CUTLASS_HOST_DEVICE
+  bool getOnDiag() {
+    return iterator_.getOnDiag();
+  }
+
+  /// Returns whether access is valid or not
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+    return iterator_.valid();
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace transform
+}  // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_iterator.h
index de25c54d..23c55414 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -130,7 +136,8 @@ template <
   typename Layout,
   int AdvanceRank,
   typename ThreadMap,
-  int AccessSize = ThreadMap::kElementsPerAccess
+  int AccessSize = ThreadMap::kElementsPerAccess,
+  bool Gather = false
 >
 class PredicatedTileIterator;
 
@@ -144,9 +151,9 @@ class PredicatedTileIterator;
 ///            MaskedTileIteratorConcept
 ///
 template <typename Shape_, typename Element_, int AdvanceRank,
-          typename ThreadMap_, int AccessSize>
+          typename ThreadMap_, int AccessSize, bool Gather>
 class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
-                             ThreadMap_, AccessSize> {
+                             ThreadMap_, AccessSize, Gather> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -175,7 +182,7 @@ class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
   /// Underlying iterator to compute the addresses
   using TileAccessIterator =
       PredicatedTileAccessIterator<Shape, Element, Layout, kAdvanceRank,
-                                   ThreadMap, AccessType>;
+                                   ThreadMap, AccessType, Gather>;
 
   static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
 
@@ -236,9 +243,11 @@ class PredicatedTileIterator<Shape_, Element_, layout::PitchLinear, AdvanceRank,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      /// Gather indices
+      int const *indices = nullptr)
       : address_iterator_(params.params_, pointer, extent, thread_id,
-                          threadblock_offset) {}
+                          threadblock_offset, indices) {}
 
   /// Construct a PredicatedTileIterator with zero threadblock offset
   CUTLASS_HOST_DEVICE
@@ -394,9 +403,10 @@ template <
   typename Element_,
   int AdvanceRank,
   typename ThreadMap_,
-  int AccessSize
+  int AccessSize,
+  bool Gather
 >
-class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, AccessSize> {
+class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, AccessSize, Gather> {
 public:
 
   static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
@@ -425,7 +435,8 @@ class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank,
     layout::PitchLinear,
     (kAdvanceRank == 0 ? 0 : 1),
     ThreadMap,
-    AccessSize
+    AccessSize,
+    Gather
   >;
 
   using AccessType = typename UnderlyingIterator::AccessType;
@@ -480,15 +491,17 @@ class PredicatedTileIterator<Shape_, Element_, layout::ColumnMajor, AdvanceRank,
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     iterator_(
       params.params_,
       pointer,
       layout::PitchLinearCoord(extent.row(), extent.column()),
       thread_id,
-      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
-    ) { }
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column()),
+      indices)
+    { }
 
   /// Construct a PredicatedTileIterator with zero threadblock offset
   CUTLASS_HOST_DEVICE
@@ -603,9 +616,10 @@ template <
   typename Element_,
   int AdvanceRank,
   typename ThreadMap_,
-  int AccessSize
+  int AccessSize,
+  bool Gather
 >
-class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, AccessSize> {
+class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, AccessSize, Gather> {
 public:
 
   static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
@@ -634,7 +648,8 @@ class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, Th
     layout::PitchLinear,
     (kAdvanceRank == 0 ? 1 : 0),
     ThreadMap,
-    AccessSize
+    AccessSize,
+    Gather
   >;
 
   using AccessType = typename UnderlyingIterator::AccessType;
@@ -669,7 +684,6 @@ class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, Th
 
   };
 
-
 private:
 
   //
@@ -688,14 +702,16 @@ class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, Th
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,        ///< Initial offset of threadblock
+    int const *indices = nullptr                        ///< Gather indices
   ):
     iterator_(
       params.params_,
       pointer,
       layout::PitchLinearCoord(extent.column(), extent.row()),
       thread_id,
-      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row()),
+      indices
     ) { }
 
   /// Construct a PredicatedTileIterator with zero threadblock offset
@@ -809,7 +825,7 @@ class PredicatedTileIterator<Shape_, Element_, layout::RowMajor, AdvanceRank, Th
 template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, int AccessSize>
 class PredicatedTileIterator<Shape_, Element_, layout::AffineRankN<2>, AdvanceRank,
-                             ThreadMap_, AccessSize> {
+                             ThreadMap_, AccessSize, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -894,7 +910,9 @@ class PredicatedTileIterator<Shape_, Element_, layout::AffineRankN<2>, AdvanceRa
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : address_iterator_(params.params_, pointer, extent, thread_id,
                           threadblock_offset) {}
 
@@ -1054,7 +1072,7 @@ template <
   typename ThreadMap_,
   int AccessSize
 >
-class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, AdvanceRank, ThreadMap_, AccessSize> {
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
 public:
 
   static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
@@ -1134,7 +1152,8 @@ class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2ColumnMajor, A
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     iterator_(
       params.params_,
@@ -1259,7 +1278,7 @@ template <
   typename ThreadMap_,
   int AccessSize
 >
-class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, AdvanceRank, ThreadMap_, AccessSize> {
+class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, AdvanceRank, ThreadMap_, AccessSize, false> {
 public:
 
   static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
@@ -1338,7 +1357,8 @@ class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, Adva
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     iterator_(
       params.params_,
@@ -1449,7 +1469,6 @@ class PredicatedTileIterator<Shape_, Element_, layout::AffineRank2RowMajor, Adva
 
 ////////////////////////////////////////////////////////////////////////////////
 
-
 /// Specialization of PredicatedTileIterator for interleaved data.  It is mapped
 /// to the congruous layout.
 ///
@@ -1463,7 +1482,7 @@ template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, int AccessSize, int InterleavedK>
 class PredicatedTileIterator<Shape_, Element_,
                              layout::ColumnMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize> {
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1547,7 +1566,9 @@ class PredicatedTileIterator<Shape_, Element_,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.row() * kInterleavedK,
                                            extent.column() / kInterleavedK),
@@ -1649,7 +1670,7 @@ template <typename Shape_, typename Element_, int AdvanceRank,
           typename ThreadMap_, int AccessSize, int InterleavedK>
 class PredicatedTileIterator<Shape_, Element_,
                              layout::RowMajorInterleaved<InterleavedK>,
-                             AdvanceRank, ThreadMap_, AccessSize> {
+                             AdvanceRank, ThreadMap_, AccessSize, false> {
  public:
   static_assert(
       AdvanceRank == 0 || AdvanceRank == 1,
@@ -1732,7 +1753,9 @@ class PredicatedTileIterator<Shape_, Element_,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : iterator_(params.params_, pointer,
                   layout::PitchLinearCoord(extent.column() * kInterleavedK,
                                            extent.row() / kInterleavedK),
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
index caed0c42..cc3fb030 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -241,7 +247,9 @@ class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::PitchLinear,
       /// ID of each participating thread
       int thread_id,
       /// Initial offset of threadblock
-      TensorCoord const &threadblock_offset)
+      TensorCoord const &threadblock_offset,
+      int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
+      )
       : address_iterator_(params.params_, pointer, extent, thread_id,
                           threadblock_offset) {}
 
@@ -475,7 +483,8 @@ class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::ColumnMajor,
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     iterator_(
       params.params_,
@@ -671,7 +680,8 @@ class PredicatedTileIterator2dThreadTile<Shape_, Element_, layout::RowMajor, Adv
     Pointer pointer,                              ///< Pointer to start of tensor
     TensorCoord extent,                           ///< Extent of tensor
     int thread_id,                                ///< ID of each participating thread
-    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+    TensorCoord const &threadblock_offset,         ///< Initial offset of threadblock
+    int const *indices = nullptr     ///< gather/scatter indices, note no support for gather/scatter at this specialization
   ):
     iterator_(
       params.params_,
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
new file mode 100644
index 00000000..8147f9a2
--- /dev/null
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
@@ -0,0 +1,818 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. 
+
+    This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile
+    first, with the objective of minimizing predicate mask updates during steady-state operation.
+
+    A precomputed "Params" object minimizes the amount of state that must be stored in registers,
+    and integer addition is used to advance the pointer through memory.
+*/
+
+#pragma once
+
+#include "cutlass/arch/memory.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace transform {
+namespace threadblock {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// PredicatedTileIteratorTriangularMatrix
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+/// Regular tile iterator using a precomputed control structure to minimize register liveness
+/// and integer arithmetic.
+///
+/// Layout is assumed to be invariant at the time the precomputed "Params" object is constructed.
+///
+/// Base pointer and tensor extents may be specified at the time the iterator is constructed.
+/// Subsequently, they are assumed to be immutable.
+///
+/// Adding a logical coordinate offset may be performed at the time the iterator is constructed.
+/// Subsequent additions to logical coordinate offset may be performed but are relatively expensive.
+///
+/// Vistitation order is intended to first visit a "residual" tile that may be partially full in
+/// both the advance dimension and the steady-state dimension. This is assumed to be the last
+/// tile in the iteration sequence. Advancing an iterator that has just been constructed moves to
+/// the first tile that is full in the advance dimension and recomputes predicates. Subsequent
+/// accesses may be performed without updating internal predicates and are efficient in terms of
+/// live register state and pointer arithmetic instructions.
+///
+/// To be efficient, this assumes the iteraor will be dereferenced and advanced at least once
+/// outside any looping structure to minimize integer arithmetic. 
+///
+/// Acceses out of bounds are safe so long as `clear_mask()` is called prior to dereferencing
+/// the iterator.
+///
+///
+/// Example:
+///
+/// An efficient pipeline structure may be constructed as follows:
+///
+// template <typename Iterator>
+// __global__ void kernel(
+//   typename Iterator::Params params, 
+//   typename Iterator::Element *ptr,
+//   TensorCoord extent) {
+//
+//   typename Iterator::Fragment fragment;
+//
+//   TensorCoord threadblock_offset(0, 0);
+//
+//   Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets);
+//
+//
+//   fragment = *iter;        // load "residue" tile first
+//   ++iter;                  // advance to first "steady state" tile and update internal masks
+//
+//
+//   #pragma unroll
+//   for (int i = Remaining - 1; i >= 0; --i) {
+//
+//     f(fragment);
+//
+//     if (!i) {
+//       iter.clear_mask();   // light-weight operation to clear masks - subsequent loads become NO-OPs.
+//     }
+//  
+//     fragment = *iter;      // load tile during "steady state" phase
+//     ++iter;                // advance to next tile - lightweight due to steady-state masks
+//   }
+// }
+//
+// void host(TensorView<Element, 2, layout::PitchLinear> view) {
+//
+//   using Iterator = transform::threadblock::PredicatedTileIteratorTriangularMatrix;
+//
+//   typename Iterator::Params params(view.layout());
+//
+//   kernel<Iterator>(params, view.data());
+// }
+///
+///
+template <
+  typename Shape,
+  typename Element,
+  typename Layout,
+  int AdvanceRank,
+  typename ThreadMap,
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,
+  int AccessSize = ThreadMap::kElementsPerAccess
+>
+class PredicatedTileIteratorTriangularMatrix;
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for pitch-linear data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <typename Shape_, typename Element_, int AdvanceRank, typename ThreadMap_, 
+          SideMode kSideMode, FillMode kFillMode, DiagType kDiagType, 
+          int AccessSize>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::PitchLinear, AdvanceRank, ThreadMap_, 
+                                             kSideMode, kFillMode, kDiagType,
+                                             AccessSize> {
+ public:
+  static_assert(
+      AdvanceRank == 0 || AdvanceRank == 1,
+      "Specialization for pitch-linear iterator may along advance along the "
+      "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::PitchLinear;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  /// Type used for internal memory accesses
+  using AccessType = AlignedArray<Element, AccessSize, (AccessSize * sizeof_bits<Element>::value / 8)>;
+
+  /// Underlying iterator to compute the addresses
+  using TileAccessIterator =
+      PredicatedTileAccessIteratorTriangularMatrix<Shape, Element, Layout, kAdvanceRank,
+                                   ThreadMap, kSideMode, kFillMode, kDiagType, AccessType>;
+
+  static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount *
+                                               ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename TileAccessIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+   public:
+    friend PredicatedTileIteratorTriangularMatrix;
+
+   private:
+    /// Parameters object
+    typename TileAccessIterator::Params params_;
+
+   public:
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout) : params_(layout) { }
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+  };
+
+ private:
+  /// Internal pointer type permits fast address arithmetic
+  using BytePointer = char *;
+
+ private:
+  //
+  // Data members
+  //
+
+  /// Data member to the tile access iterator
+  TileAccessIterator address_iterator_;
+
+ public:
+  /// Constructs a TileIterator from its precomputed state, threadblock offset,
+  /// and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      /// Precomputed parameters object
+      Params const &params,
+      /// Pointer to start of tensor
+      Pointer pointer,
+      /// Extent of tensor
+      TensorCoord extent,
+      /// ID of each participating thread
+      int thread_id,
+      /// Initial offset of threadblock
+      TensorCoord const &threadblock_offset)
+      : address_iterator_(params.params_, pointer, extent, thread_id,
+                          threadblock_offset) {}
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+      Params const &params,  ///< Precomputed parameters object
+      Pointer pointer,       ///< Pointer to start of tensor
+      TensorCoord extent,    ///< Extent of tensor
+      int thread_id          ///< ID of each participating thread
+      )
+      : PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id,
+                               make_Coord(0, 0)) {}
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    address_iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    if (kAdvanceRank)
+      address_iterator_.add_tile_offset({0, 1});
+    else
+      address_iterator_.add_tile_offset({1, 0});
+
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the
+  /// iterator's internal pointer is reverted to the first "steady state" tile.
+  /// Subsequent calls are lightweight and must only update the internal
+  /// pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) { address_iterator_.clear_mask(enable); }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() { address_iterator_.enable_mask(); }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) { address_iterator_.set_mask(mask); }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) { address_iterator_.get_mask(mask); }
+
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    load_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+
+    AccessType *frag_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+          
+          address_iterator_.set_iteration_index(idx);
+          char const *byte_ptr = reinterpret_cast<char const *>(address_iterator_.get()) + byte_offset;
+
+          AccessType const *access_ptr = reinterpret_cast<AccessType const *>(byte_ptr);
+
+          cutlass::arch::global_load<AccessType,
+                                     sizeof(AccessType)
+                                    >(
+              frag_ptr[idx], access_ptr, address_iterator_.valid());
+
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) { load_with_byte_offset(frag, 0); }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    store_with_byte_offset(frag, pointer_offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    address_iterator_.set_iteration_index(0);
+    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int v = 0; v < kAccessesPerVector; ++v) {
+
+          int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous);
+
+          char *byte_ptr = reinterpret_cast<char *>(address_iterator_.get()) + byte_offset;
+          AccessType *access_ptr = reinterpret_cast<AccessType *>(byte_ptr);
+
+          if (address_iterator_.valid()) {
+            *access_ptr = frag_ptr[idx];
+          }
+          ++address_iterator_;
+        }
+      }
+    }
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) { store_with_byte_offset(frag, 0); }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for column-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::ColumnMajor, AdvanceRank, ThreadMap_, 
+                                              kSideMode, kFillMode, kDiagType,
+                                              AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::ColumnMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kRow, Shape::kColumn>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 0 : 1),
+    ThreadMap,
+    kSideMode, 
+    kFillMode, 
+    kDiagType,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { }
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    }
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.row(), extent.column()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.row(), threadblock_offset.column())
+    ) { }
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// Specialization of PredicatedTileIteratorTriangularMatrix for row-major data.
+///
+/// Satisfies: ForwardTileIteratorConcept | 
+///            ReadableContiguousTileIteratorConcept | 
+///            WriteableContiguousTileIteratorConcept |
+///            MaskedTileIteratorConcept
+///
+template <
+  typename Shape_,
+  typename Element_,
+  int AdvanceRank,
+  typename ThreadMap_,
+  SideMode kSideMode, 
+  FillMode kFillMode, 
+  DiagType kDiagType,
+  int AccessSize
+>
+class PredicatedTileIteratorTriangularMatrix<Shape_, Element_, layout::RowMajor, AdvanceRank, ThreadMap_, 
+                                            kSideMode, kFillMode, kDiagType,
+                                            AccessSize> {
+public:
+
+  static_assert(AdvanceRank == 0 || AdvanceRank == 1, 
+    "Specialization for pitch-linear iterator may along advance along the "
+    "contiguous(rank=0) or strided(rank=1) dimension.");
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::RowMajor;
+  static int const kAdvanceRank = AdvanceRank;
+  using ThreadMap = ThreadMap_;
+
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+
+  using TensorRef = TensorRef<Element, Layout>;
+  using TensorView = TensorView<Element, Layout>;
+  using TensorCoord = typename Layout::TensorCoord;
+
+  using Pointer = Element *;
+  using NonConstPointer = typename platform::remove_const<Element>::type *;
+
+  using UnderlyingIterator = PredicatedTileIteratorTriangularMatrix<
+    layout::PitchLinearShape<Shape::kColumn, Shape::kRow>,
+    Element,
+    layout::PitchLinear,
+    (kAdvanceRank == 0 ? 1 : 0),
+    ThreadMap,
+    kSideMode, 
+    kFillMode, 
+    kDiagType,
+    AccessSize
+  >;
+
+  using AccessType = typename UnderlyingIterator::AccessType;
+
+  /// Fragment object to be loaded or stored
+  using Fragment = cutlass::Array<Element, ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>;
+
+  /// Predicate vector stores mask to guard accesses
+  using Mask = typename UnderlyingIterator::Mask;
+
+  /// Parameters object is precomputed state and is host-constructible
+  class Params {
+  private:
+
+    friend PredicatedTileIteratorTriangularMatrix;
+
+    /// Parameters object
+    typename UnderlyingIterator::Params params_;
+
+  public:
+    
+    CUTLASS_HOST_DEVICE
+    Params() { } 
+
+    /// Construct the Params object given a pitch-linear tensor's layout
+    CUTLASS_HOST_DEVICE
+    Params(Layout const &layout): params_(layout::PitchLinear(layout.stride(0))) {
+
+    };
+  };
+
+
+private:
+
+  //
+  // Data members
+  //
+
+  /// Underlying pitch-linear tile iterator
+  UnderlyingIterator iterator_;
+
+public:
+
+  /// Constructs a TileIterator from its precomputed state, threadblock offset, and thread ID
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object 
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id,                                ///< ID of each participating thread
+    TensorCoord const &threadblock_offset         ///< Initial offset of threadblock
+  ):
+    iterator_(
+      params.params_,
+      pointer,
+      layout::PitchLinearCoord(extent.column(), extent.row()),
+      thread_id,
+      layout::PitchLinearCoord(threadblock_offset.column(), threadblock_offset.row())
+    ) { }
+
+  /// Construct a PredicatedTileIteratorTriangularMatrix with zero threadblock offset
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix(
+    Params const &params,                         ///< Precomputed parameters object
+    Pointer pointer,                              ///< Pointer to start of tensor
+    TensorCoord extent,                           ///< Extent of tensor
+    int thread_id                                 ///< ID of each participating thread
+  ): PredicatedTileIteratorTriangularMatrix(params, pointer, extent, thread_id, make_Coord(0, 0)) { }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    iterator_.add_pointer_offset(pointer_offset);
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix &operator++() {
+    ++iterator_;
+    return *this;
+  }
+
+  /// Advances to the next tile in memory.
+  ///
+  /// The first time this method is called, predicates are updated, and the iterator's
+  /// internal pointer is reverted to the first "steady state" tile. Subsequent calls
+  /// are lightweight and must only update the internal pointer.
+  CUTLASS_HOST_DEVICE
+  PredicatedTileIteratorTriangularMatrix operator++(int) {
+    PredicatedTileIteratorTriangularMatrix self(*this);
+    operator++();
+    return self;
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool enable = true) {
+    iterator_.clear_mask(enable);
+  }
+
+  /// Clears the predicate set efficiently
+  CUTLASS_HOST_DEVICE
+  void enable_mask() {
+    iterator_.enable_mask();
+  }
+
+  /// Sets the predicate mask, overriding value stored in predicate iterator
+  CUTLASS_HOST_DEVICE
+  void set_mask(Mask const &mask) {
+    iterator_.set_mask(mask);
+  }
+
+  /// Gets the mask
+  CUTLASS_HOST_DEVICE
+  void get_mask(Mask &mask) {
+    iterator_.get_mask(mask);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(Fragment &frag, Index pointer_offset) {
+    iterator_.load_with_pointer_offset(frag, pointer_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load_with_byte_offset(Fragment &frag, LongIndex byte_offset) {
+    iterator_.load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Loads a fragment from memory
+  CUTLASS_DEVICE
+  void load(Fragment &frag) {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) {
+    iterator_.store_with_pointer_offset(frag, pointer_offset);
+  }
+  
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store_with_byte_offset(Fragment const &frag, LongIndex byte_offset) {
+    iterator_.store_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Store a fragment to memory
+  CUTLASS_DEVICE
+  void store(Fragment const &frag) {
+    store_with_pointer_offset(frag, 0);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace transform
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h b/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
index 6eeb0a10..8ab254d7 100644
--- a/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
index ea5715c7..10fc5e96 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
@@ -1,27 +1,31 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- *modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its
- *contributors may be used to endorse or promote products derived from this
- *software without specific prior written permission.
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
- *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 /*! \file
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
index 9e5ea0c6..1b78d024 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
index 91ee36aa..a812018b 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
index 7d92e855..800e6374 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -535,7 +541,7 @@ class RegularTileAccessIterator<
   AccessType *get() const {
 
     // Map the logical contiguous and strided access to the internal swizzled structure.
-    int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16;
+    int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16 + stride_ * ThreadMap::Delta::kContiguous * iteration_contiguous_;
 
     char *access_byte_ptr = reinterpret_cast<char *>(pointer_ + uniform_offset);
 
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h
index af0b4b1a..7378cfa9 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
index 88678bf7..4581360f 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
index 96657184..24882dbf 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
index 57f0c8a3..feaa82e6 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
index 7a14dff3..68e8928c 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/threadblock/vector_iterator.h b/include/cutlass/transform/threadblock/vector_iterator.h
index 06c2d88e..5cb052bc 100644
--- a/include/cutlass/transform/threadblock/vector_iterator.h
+++ b/include/cutlass/transform/threadblock/vector_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/transform/warp/vector_fragment_iterator.h b/include/cutlass/transform/warp/vector_fragment_iterator.h
index 62d9ef82..4bd33fe7 100644
--- a/include/cutlass/transform/warp/vector_fragment_iterator.h
+++ b/include/cutlass/transform/warp/vector_fragment_iterator.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/include/cutlass/uint128.h b/include/cutlass/uint128.h
index ab9346c1..a55b5269 100644
--- a/include/cutlass/uint128.h
+++ b/include/cutlass/uint128.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -48,12 +54,14 @@ namespace cutlass {
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Optionally enable GCC's built-in type
-#if defined(__x86_64) && !defined(__CUDA_ARCH__)
-#if defined(__GNUC__)
+#if defined(__x86_64) && !defined(__CUDA_ARCH__) && defined(__GNUC__)
 #define CUTLASS_UINT128_NATIVE
-#elif defined(_MSC_VER)
+#elif defined(_MSC_VER) && defined(_M_AMD64) && !defined(__CUDA_ARCH__)
 #define CUTLASS_INT128_ARITHMETIC
 #include <intrin.h>
+#if _MSC_VER >= 1920
+#define CUTLASS_INT128_ARITHMETIC_DIV
+#include <immintrin.h>
 #endif
 #endif
 
@@ -172,7 +180,7 @@ struct uint128_t {
     uint64_t quotient = 0;
 #if defined(CUTLASS_UINT128_NATIVE)
     quotient = uint64_t(native / divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC)
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
     // implemented using MSVC's arithmetic intrinsics
     uint64_t remainder = 0;
     quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
@@ -189,7 +197,7 @@ struct uint128_t {
     uint64_t remainder = 0;
 #if defined(CUTLASS_UINT128_NATIVE)
     remainder = uint64_t(native % divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC)
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
     // implemented using MSVC's arithmetic intrinsics
     (void)_udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
 #else
@@ -206,7 +214,7 @@ struct uint128_t {
 #if defined(CUTLASS_UINT128_NATIVE)
     quotient = uint64_t(native / divisor);
     remainder = uint64_t(native % divisor);
-#elif defined(CUTLASS_INT128_ARITHMETIC)
+#elif defined(CUTLASS_INT128_ARITHMETIC_DIV)
     // implemented using MSVC's arithmetic intrinsics
     quotient = _udiv128(hilo_.hi, hilo_.lo, divisor, &remainder);
 #else
diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h
index e16a8ef7..fede386c 100644
--- a/include/cutlass/wmma_array.h
+++ b/include/cutlass/wmma_array.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md
index 0d29e4c2..c83c7c90 100644
--- a/media/docs/code_organization.md
+++ b/media/docs/code_organization.md
@@ -220,26 +220,32 @@ of tests run may vary over time as more are added.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
-```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md
index 716ff8d4..53620d99 100644
--- a/media/docs/doxygen_mainpage.md
+++ b/media/docs/doxygen_mainpage.md
@@ -120,26 +120,32 @@ cudaError_t cutlass_sgemm_nn(
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md
index 5c617796..203f6810 100644
--- a/media/docs/efficient_gemm.md
+++ b/media/docs/efficient_gemm.md
@@ -225,26 +225,32 @@ targeting NVIDIA GPUs.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/functionality.md b/media/docs/functionality.md
index 8c43dd1a..b85b3a39 100644
--- a/media/docs/functionality.md
+++ b/media/docs/functionality.md
@@ -249,26 +249,32 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md
index d3f7862d..2d493d0b 100644
--- a/media/docs/fundamental_types.md
+++ b/media/docs/fundamental_types.md
@@ -346,26 +346,32 @@ support on current and future NVIDIA GPUs.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md
index 2bc9ffac..01ea8b7e 100644
--- a/media/docs/gemm_api.md
+++ b/media/docs/gemm_api.md
@@ -542,26 +542,32 @@ to inline PTX.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md
index 69862b3f..c7c3d487 100644
--- a/media/docs/implicit_gemm_convolution.md
+++ b/media/docs/implicit_gemm_convolution.md
@@ -754,26 +754,32 @@ Convolution can also be run by the CUTLASS Profiler.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
-```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/layout.md b/media/docs/layout.md
index 5421ac4b..9a186c4a 100644
--- a/media/docs/layout.md
+++ b/media/docs/layout.md
@@ -267,26 +267,32 @@ Permuted Shared Memory Layouts:
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/profiler.md b/media/docs/profiler.md
index 9510101b..32206310 100644
--- a/media/docs/profiler.md
+++ b/media/docs/profiler.md
@@ -496,26 +496,32 @@ reference_device: Passed
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md
index 542affbe..45efea2d 100644
--- a/media/docs/programming_guidelines.md
+++ b/media/docs/programming_guidelines.md
@@ -292,26 +292,32 @@ Github's pretty printer.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md
index c7a9a2d2..be274c7e 100644
--- a/media/docs/quickstart.md
+++ b/media/docs/quickstart.md
@@ -542,26 +542,32 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
-```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
+
+```
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/terminology.md b/media/docs/terminology.md
index 6533d8c3..96d51639 100644
--- a/media/docs/terminology.md
+++ b/media/docs/terminology.md
@@ -74,26 +74,32 @@ contiguous and strided dimensions of a tile.
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md
index 6f5844d8..cba82e42 100644
--- a/media/docs/tile_iterator_concept.md
+++ b/media/docs/tile_iterator_concept.md
@@ -466,26 +466,32 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept {
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/docs/utilities.md b/media/docs/utilities.md
index 0ff6e8af..34c06acb 100644
--- a/media/docs/utilities.md
+++ b/media/docs/utilities.md
@@ -379,26 +379,32 @@ int main() {
 
 # Copyright
 
-Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: BSD-3-Clause
 
 ```
-  Redistribution and use in source and binary forms, with or without modification, are permitted
-  provided that the following conditions are met:
-      * Redistributions of source code must retain the above copyright notice, this list of
-        conditions and the following disclaimer.
-      * Redistributions in binary form must reproduce the above copyright notice, this list of
-        conditions and the following disclaimer in the documentation and/or other materials
-        provided with the distribution.
-      * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-        to endorse or promote products derived from this software without specific prior written
-        permission.
-
-  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  2. Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  3. Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ```
diff --git a/media/images/13_example_shmem_resident_fusion.png b/media/images/13_example_shmem_resident_fusion.png
new file mode 100644
index 0000000000000000000000000000000000000000..60d6beab43fc55d5158b59ec55e528399a5d3fc1
GIT binary patch
literal 18318
zcmeIaXH-;Mvo72QKqWRvjxInlC?H4{SqMlHL2{6cpoAt#Y(<eqVv~~~NCrVb$w4J&
z5D<`@B<Gy(YH;uE-tT?KIOp7ZzWe<+V~7msT65N{S+lC1XV${^o{|jyC8|p>7z|%d
z7Q7FGo$Z6Z?q4_y{l+PVc>(%%#{Ry{ZCFkl%`Ei8d2>lcNf<0I2=~Ac3;Ow@t*n+k
z3`S6m`8rc?lWqco$uP=+l4?)&7vD%UFv-NE>G19GU6{4L!BPw#^u0dNZ`^*UDK#9(
zX5aeU+o(m$TuV6>kMhvPs)=Z$i8^<b=(hR*agrp8_`>94!3%X39Z?==xQ&@Kct@?`
z0jUZ+dQ<dZrmd-|rRnu+p5Zj!f=s&d^N=bQ=K+k?t+znv|F@K=3(z<IbMP}T*fTQ*
z0Q#W=55&mDJV!zQ-{K>JL*Fd0|ML%No-Z5P^0ar}emK6gWNlwHv^aV6eDThbU;ks%
ziv=wek0-1PHhTm56bDMn2<gW2dste|I!O)3zxi5BZN_8L7XPM#XT5j`qsFc6;&1E}
zXRbHkzu=)P+@qHCU5)z^8D5U{R9+fD^v^}8+{SJrMIWAPDSU>}I?VVZk=udP&x>0O
zjUJqUk5w6GflzU}Uc*jHd6GeK>Gop+oAC=v@VKDbrjuoceI@lY)gqPeKs9WA&NpUu
zo|Xs!5RG$n>94eK3T<i1)^><GjZe(uZQG-*qjuwyhxS#S+#%X%W^UcXDeKa7>4E~S
zq@fD}p+g1Sl8|lR8t7I$=v7mEGQtze*5^ZBZ0kta+Ge(@?i4y)P2)MZcxc<HK()--
z+k@GOw>r9VJ>*syjh(fbhAx|TpCZ<`JT+=nNe{pFh?R4_39QOrsNcASL$5)m6gt%V
z8ly@FSI!{zl%2k8#r>s-po`5`_m0hZi7v0INmtoA2K6_k5#?|O0D?iH*5;?;awL5T
z+P3smt9MIM3w@f4b-Eafaq#BdHdZ3vFD3T(bFMZm6~@|4+KFKN;hD7^^9>zU^Augx
zfp?}}nI1FZV~{fZBC)2G`LTSWKhX7yC$6$r+taP9^5%iD;*+3Aw9hp@Gc3^uojkfj
zQjkfAIh5THD|V(C{QVy9FJQb#Y=Ygo6l}}POD$}Vpv2ud%=S!U-}BCLvv3lSN0<t(
zFCBUANNg9oTVlgtqnCAEZiM%g_Yzvpw~^^qQCV5K8AY$c@xHi+qv8rneay@YOw_lW
znD>j{oOsIePF1TB`{Nx|KF=FYjyG3z`1Dnj!dEgpeG0+>MAO{xnTxxgTXyEkB2?%f
z0cW<O&Os+2uGQ7kv|Air;{22fM=;EAFkr5Vx0L>iy+2`?h5PoUez$XEJ!jvL9BcL6
z3DOqZcOJWe*12k_AjoIkpti1G>{=OHYej#xggRms-ZpE+1?{LXBSq?hh{STC;9SFJ
zLFDu_zPL4|jOAy<Q~4BZ-FnI_Wo2on`|<UU-o5|g_N-q&8GEmQa@f(W*e9&_^L2ro
z0p{G*dhd6%y4tufei;1BjBkc*>xdrDlE$5>k{u(a0;TMjCHN$((?`o%UU!SGZ@iwN
z4R%+MyD3Gm&krC4_P&JknySckf2Ew@#fhMuj}qCV7vGT1Dht0GrdXCTf7~>asK}zj
z6koDK1}naVx^PP{8CN6Fgv&oFqEVg0S;ov)UcLQ2;z#I5u<W{mqRI9Gg5Zg8|4wJi
zR1u8-bH#TbU*X|#$kkoPYR-~|S1|3H@=ndIoQ=$mAE@A)OD$p(oPF={)P6v|l4q<V
zm889mYpHcU2@AG|11;yJ))f&iwdBLikO+NGQ~v$O`OcH8PR6<(u5Wkj&Y6tyM>}rO
z{P;MKvSZXEa?F_2&k|F?*xt_t`v4<?KT`_+@ss>=h7g_pCZf6iB~~NP!?A$pZ(pX&
z^<!nF2~0;lnPwfMvTmMeJ($pKNmMM;i<!u1xTFmmWrvQ!n_za<;S8*8^u=7)M@?GA
z{$)4Byxcy2D3vBe9?dN|S#I<1(-P;7S(i~}3~|>$W{R)b2+-km#^Qq6lAcWeH9U*8
zU@nEtypvb5RoEfHwsEdxNM5WB{xcR~|0*CZPl(G$;EjvZ{$Y$YX9*T8=PYox12-pW
z@@`W$+0<N|a=E|g%joMI1{tnSjdS~VDU<paQ8nyc@ySkZw<GR`YMHkuDw@aFe$=TB
zpwosuJOinHI69|Tn%-iGPK|Gkplt4iLKc4o(%O2B(oD|!(crxb14=Qqu`Xq6OM1a7
zA2QesJ$U{ti>}>u$J+8PeD>zENK-YkO>$h^PZ2fNcu&jjr{E=U0uwe%3I&nFXJHy=
zpl$J+jgj#?d`=Qgw_=ITy1iQOjIS?um-Iu1tM}>o>B_fesb?Y1NoV*(eeQlYegMd1
z;ocv);yfEM^lOY@$AhNy#2V%HROf15vhe}|H~$HMn791S`k^KQ7p9Aa$uDGI()C>K
z*eFC9X<znUy=8KD>vBUbOuD|B#^)s&CnhU#dP$OtK3=59Gj4xsA{#g$ZQx-g8|~#R
zu%Qd@cWZx)i)v88BqvRV`t~<i_ZfPLeVT*nHL;B;kE%e>AE#eYTI3~)1`B@X+0~~u
zKtY@q8QEDbTS}JKvX(!u$y7#2l)NG<TN|o~OOz`i01&VfwPer0T}oobnUN(ZJdS2K
z8IteMX`lzc!rhxt4L4^ONdd&=6Zr-^xG#_TrYDWd9X2?nxA_kqaFX>z{oo0Jmn9dd
z1x@b5OMFihfjz@Q=z?e8M~SDzMg({Pyu(g>MCf;z==|Z9(q;4+L6MTXD)3eMi*TV!
zm{^HBi3hjKVsd=<0kuGX>5pF~+rTN!hld8}+wOn_n1uKFtdJHsS@xi-P(>fC>n8&z
zVSlJF?&0pd_bD3MpTccK?!k}Ek9CJ~cR~s%gw7h@`txaS;L07|@r02fzuPU3#)C=F
zhm8JJbvPIQUGo1N*X@1vy18cyw)?*m`HvM99dL=Goe3Nm^G>tD5l~@yv12_~L1PHY
zKQkza!&zlBFoGIJzTzWsAekBg6FY8l=n;bsV4$Y@>a+OWY2?v#)zM}y;+@;fu10Fs
z5|jcQYJBKDSyf??tDQ3R9*3K~zkj&l<?>>!UgAi_8m3#9mf%?$)(p<<_NJXYiTTiI
zIJE))S@X_ZRxX{GE0f^ft9YbG=^Pyt9(ke?$0Uv>Bqp&xo4RigIm>4(Z5+&ZJ=lhV
zt?QVo-yp2mpB{Ozs*KzV9h{+dJ=&bjbc43G;$fZ9`uBU78Apz`ic)l+O14*t>p2oa
zr;}WKZ-()eF>3=3y*q=dhg7@GZHbe`WAV`-p*axeMvWrYrf4<}u5NT;7Op?q%$&$6
zFLVPEJ)33*W2p1TMeYxbKuXnePN6P{Rb8Zk96l<g9O<3%A<PO|o#H4J9ELKy`6{0Q
zu*EpJ5k+7kbi!z$tO5y!;cH%!XIuGehKK(~QsQ{Gofqq#SCGV!W-4Ttk(ZvOIRE*w
z@IP8hx*9qow;vXSQ}YzX;t(z^IU}r!XNU5)U2zGs0u}sr`Cf!id?M*%-kOh!^&vA@
ziWh{^3RJXQ+Z4>KRmI28+WjzUHbIKCX-y<5+pWG@$Gl}f)Sb5{Dmtw9Ved<AWZCrl
zlRAz56)=~X|Gs@&6~5W@>C4bs)+gk#n(HA}xuQ;Bk9k?Ki@Vlb-}vy)#T-tHcIp?H
zdQ?V1+PQ2Mx6t5yh0Z=O&vos|SDxwW<Rb5hO|3#+>jg->n%la4zzZX+EISl-oP5{K
zu|Jr7sG9ieQ-f>vqT<)=vc-dRB7DRPizUg>$0jVMj*?~s{sY_2`91N1P}G7w%lY7K
zpBbOyfK%H&c!45osWK-BOYg2-i1B9sAWv+b>b@$M%fi*PMeaxKIwvz}^tnC`4iUYq
zy8KPPs!e<Hr&`+~pr0By&`J>)u|BorB9A$(Gq^fp+oS{C530iM^ti{3g|8p-&LXS@
zZ^Eq~E3e#i3ZIyAFL4_t!o;^z81OlOgwEo_WT1dU9b(&^@ISHXxA?@557xIwkjKjr
zjUvnCjH>bzL(U_I^~i(z+{(SX`ma(k(XRu~_{!QQxve9exH~J4lqyC|i&rkr+z;S6
zZOh1GHy6s7lNIuT;L(5^;n8Me$XeA=F4q+9pna&_G#rvIb|gY$V|ElqgOYuL85KqV
zCQF{gCmi4#112IqI|qMsvaZk1QT!*>=;$qw>O2Ph*fIe4m?>a4|7VzgW1Cx&PTS6e
zbdL)WRu9WuRz{aRS&Fb<33UP00T%d0XtARo)~oPiI%JBiJ=>tuI2XM^E(O0@!l_|z
z$-Uwg<U#e{HnV;#U}uV3XgL4Vq1m9n$~;Pfds2CtB7(9{BByG~>Xww_#nbJ&%*^>=
z>-*5-z(J~1Ma9tLt6vq*hx256O}w}8o-w;r_OqgDuxH+?IC6^&zlY{@uUVV#rGGs;
z*Q;E$rMP~Gg?Ge@UA5-@z_y&Uw{B;PM(eB2%8qafB^~<J9Nw=dS}N}cc=K`^7pzff
zbdP?wH8$8eV85v_Ks^S$f>u(uxS+9F<j%ZBD75sUh{P=9RjK%?gWZeCP>a5(@$;v>
z?xWaC+V|j3I%6-q9{!LhhUl%7z4YaKh=Pbo+AGs-`<(iXC^N5Db7y}|8Q9IfKB75g
z@{PmD5dO&@*KMkk`$n5!a@Ipp!HO{Da>m%jRY!IPLd&CfiF>e9pM5=XY4ZDlh<bK^
zoR*N^>p3;ml!FrL=qAeSYh}?uUz@FQ@jd>$sitDFl*GL=r_VGQ`z09~d`J@V6)wqr
zt>Uo}so1Mgw8_|f{$+EE^R{*iy-<vllpgu1TItWSG>^)j2+FW)=AT!RMntV>&_+ZI
z?EVlMWZaoILlu!M4Bbdf+&lYk4$-sl@i#9nXfC^W^Cc#3{TI%Y`>$Xzf>&=FRW#zc
zzu9YmT>R8K>^jdCjpZ`zwp<|-L?_2GPf?I1B?p2zlu(Rv_xCQAJoA>bZXh|-5%1J<
zb{cNs-HXvM@OU7sRe!3;v-bNq<MZ_a@%&*|3*Qr=UrT}*?9Y!4?}Yy0z%yvUWEv>V
z8xz5w5Wzi&&_Di1;J;!A^BcfgqjFg!kF$aMV(WqD{z6*WVJnNX!NJ<vR6DgzU~I`@
zORRCxM8U|<=Vb&SBQUfq3T2dYD3kySC7uOGBM<kE`kNy?SC1B^BYBV|$4t7T1IzWX
z+#@!vsfW>p58Zk^G;ditIrM%wT!DvnqGY#4f>5x}FdzvZNNRTlYiRlU$d_A-s<k_>
zJm0zYETewpsC-l2Q@2@68nqmPplgYiVFoIGpO%9d{o(tM7iZc%qlY(S!iUY$y9U}{
zA>V}<yo;cp4fgn5)`}Re;XIV_I&tZ!obo7qPmIQxV!pJ7Yoh7EK|}My`!oK6=AT5T
za0*zZlb7GffUt+K6CX2w0F1w$*Npc`5~sbkk`qC}r?%}eMlYYw)kmkF5olQ=mPDkR
zg!WelONU(`$HqSO#P7_2@dLWM#h3L|?Gg?h1iSTe*_%UD4dd^6UK`O*hxZ~9WC^~h
z_{p~aZiObW=E0N=(kYE=A#S;d2idO}dDlPBhyM29S;+u;0_jO5cZ+xA=s;&Ws0us-
zg47_G@X5D9Gr1y^V5{$z1MHzwesz2%#hioMhsA^4)mJ14FVsfaI+R867=RxPz+Scz
zNKESNX=yB8Pb+Z^`<l*B<})!+#Q-cF`dxb#p!#CqZ1*eq4X%`G%HSCjAK&f;moJ6(
zucmmirb-F=XWJM7*tKUUix;Swg4^&}&NN?Hb1090#`(X0GPx;Gi@kU?27XNa?y>O_
zo^Hk)hPPvtJ%6-{-(uaJOj|72v7&cw_w4sv^(ZetEM%tF;@t6iS%9cS4Qfh7GXS4n
zpu!+<V$s5UF7Ghrc}*|jWM#yypVapy{7j3;JW{?<Y*%2(CGp5c_4N9hR7hNz_G<{Z
z5q1%{FM^U4f(rOLU4A0@>#-G9k%*fd5ysI99_PL#R%82;O6<C>*S6nxIJ_-BkW!{x
zIZ~SBY3N&V3$)HkRIt7SZdJ91e2^pI2;++#bdOz?_24E&cRFxI+c5b#=?lkSDJU0)
z_GU*{fT+3eGQL9B3{7u=uGkjNMt4R3S5E4USDP(TlR<Ln_P(e=HL2)96bkRjObkC4
z&C(P~-%kqX<3`Cnuq9rZCvWkhozL})PQ#*K>=Nvo=b?i4S-bc0g%oQnsSn#`9NFd>
z-g<-V5+eibvD73a&aKntb5>53;^e*!TsOiR6;UV;uDafaOGbkh)h(mpZ}eBc1H>jD
zCU>uY=x<ZbQ-3oPHwRT`qxOj?rn!hLmv+y*@OFCJE*Y#>#Q{-?t?xzXCtmmT;$He}
z3LvELO%b?&QLKYO)sr&sR&fg-gV8DfmJgGrtycH%fk`ZYuk$c26h5R&)IhTbU2jDC
zJXR~*bm#-ReVYX7U>|gXoex%dOZJ3E%$8Kvf^u5D0fY!NC?Ak-Gd+{qB6qRA<14K%
zYEiA=uyL_`W1xA99x4YD{G`Fm2~8Qj=N~7+I37)O>J@1oDVD2M=85{))S6Ht-yZKR
z@s(inY2MV68Pcyczopci0wDPCO}$XXz1REw^?XtD*9zJq`dv7>$hYuGFo}wr9Q`)1
z)K2jgyEESBfRg)TUPb-_c9Qs6`6k)9_D&<OMuM+r<sSJ+{tWWG%}s{ZdBjQr)>Nj>
zn$c)ilECfWaD}LZ9<;^w#gkC4$YG91+B4KsM}GE*ePh^thUtl1gBSMph|SH&Me~Br
zdvjJ^ey@~5o24i3P&G}OfhHsBXnj1vX{_O^+H2)uMDX5=Bwy_wPCt3<EcWp;8nY*=
zx4~55y~87i7iG;4%FHF6ECiERd2ExdOqadK^6{(R6DEks!A@&4D8|P=$lu>eYxSO}
zlsf}233$a@f(5sGA|jB@HMjpmw48rkz?o9J@*Lbw&vdwpV7WYC@hRk=my=Ma;v)66
zGr<onLY)^y+**)F>N<BHQQ0K>&Cgc`)-6w)9AUjP;bsOWuR>a11#2RM#-`TwZ`kwY
zQhA_Iiz<ynvWo{Dxv4ou`*V`2Lc|!$zD6~u9pkCMH`y%^4k9&`PHpq^p3R1Ai#nin
z5_Ei|K!vT`7>y_#Njcg_!5a*~+T4Bvx2f~+^2<m~)J4dD#7#kyy^poe0b%N)y6SVj
zD#~@+kILnHdRJL*-{f@S28ufFT@_>mS`mq;kNB7eQn1I}^TNJ~hzv3G*m+VJp~4#T
zwA1G2j{=4~J(mH*IgBrK69D%KAfSJ}7TZYYY-sJbh~fS-&(gi>E@s)}RJOH?IPd`T
zHqbUFI}xD~&=v%+2@=H$ZbTbB$~1O|JrtEf3lEI~-d$Y5xw)$@K1b3fn|~Wuo)(px
zU<A!330mU7*&smPL2(kz>Hiv168tC$Zv6jTpyd0lgUz1<27}9aluqrQmB*Z$@3vG1
z2@9r>%dLl3b?0z~*^{9#?6K9g)z4Gtjs5YxNr**yuAsp|xxGY`xsah%<Zo464lhwq
z)jn0o0h>F|T6Mv0t;usoRKpOt8`f%n34OCzn2A)%z`@&|aNNH_b2mLIjXwX_g|W)x
z_|pO6u(>67tgM0Ohq-6kQRs5f#K7SL6zXa9(m=x2d;ZKuG)vT@^fDgLVb9gWblUy*
zwCh#Ge;GMy?p&UT6;Whkr_!3u_S6MQqvsxqj@gslEi4jGCRp6c!xh%>No5X}FX2f4
zRcxki-QujZQNNnvZsupS^|85lyggQVlJ)CdkSSbu-lb8XTh!IGgdZil$N)@1#1oaO
z3qxDk?LFDXr~}i?YD%1v8N?<4gxJlPG_B6NY};v9XHY_}fPuj)86rf@Q@Csz!DZhf
zeP{0e{2>kME_Q_TTNY8Yo9m6)LI>fmG$jq*PuU1Y-K@U(lx;rP(GT_WEc}5~<1Xz4
z$B0sIjymMnf26bxR$xi?^Xm4J6&buFp3!-(NxJu5C<;$hr1;DI-z;(jg2>G&S80%H
zdPOLbgj@9n?uf@~!N67Wti3;q^ami*15SJLTMg_nf=gDU@K(9#P%2CR@uX1R-$rUW
zmsfmy{7uE>+Ty@cRpL@*r)*VFjm1>SDxIy@^7>z{pCL+QJ~G(sjl1fu!4_`Q-^&W%
z1YW@s%1DHGhwG;>ZdBEcp{=~CT@7@O0{43o=1Jf#cj|d2ZoW^fv$ZtC{sk%2=u!1W
zJE@jUx#{j}6*O0)dsHBKE;1_;^y&mxJ<t9<{_fuG5p0Pfx9$F?d2<<ih&z96?9WT*
zR-fKuGCHap^z}ZPYj%Vf;hYAdz86V}2%U+EvMaxIu}^vabSH1HBJgR|^|GLyri=94
zdZDNW`^Ir!RA!?1Ns%DpJlUA<w&k~NTps@7uMl$*3QB?eoWIHxypxic@UN{pE#3Y{
z;D79JT8;jn+o9ps!Q6cH%q@b+fFcXlCy#Zm0HnrL{m)Ulcu+2iavt&M#lHkIl?2(b
zPt#KrayJs`G8L{dSPZ)tjND5@hA4gibrk>aQFd^2t%rTp{+H*>OC^oP7}x?E6h5et
zl)~_V#g03^FVxHT=CUBdKCc{4@q{+~?`($nkFwtX=WQ4N4`-5<{*&x|Yy69>X~g}0
zVXslH{BUc)rhK(d$EJe0=t_0ja^*hZ@d_cm=gt_!(lG12mnZviOg=>DHcyR`m2NTU
zJvS2%aX)(cAMSK0zhx(+7Z1z%&b)E_%Hf+J#NAM=hG|d>|LoB)2q({mZ;+%nVMyXx
zOTF92HnJ=p@0{Vg^9ILz(C3F!JjmUKi5D*pR)dgX;W`sXpO5yB-B)amgEpeJkK0$$
z`g~Cx%3-^$I@MWs>>rK0?f{^P9nTD6KhfVUm+%+>v?}+PL>NgR9r#;K_R{VA2qg~g
z(bO1qPr{(ft9OsnZ(NRQdx7G-4$&7V*$M^#YgPqRs5;o_QVs}gSggJMd{z|IKn%fK
zC|E<~MlhXp&Bs6mE}BaU(~@8WRIWh@TAv8*#Q=~des-V(O?*$vwK*&}+Xdi0{J&d_
z5@L0$o5PF?!lwu%|Dh`p$n=5~9&SDPuZo433$r4et3tKX{`H|r*8NcaG&+WVeJaVU
zj8b3l#UAV;R_ZzfAP;3IC_V9@WF?8vs&?XEYcs7b0Qof#zrX-&dZ8|#hj)&9_}_7R
z{pxI&3lW<6l&gaXO9tp(HdU3E#o*&zNarjfbgOC^dRk(zb;)&QN6Yo-9lfsduz38?
zrj7DaLRB&Ha;t+2;&?;)fMw;Nc4?L{_UW#Tkw#=~{7b(#;Sktt{Y3hQ9-QVhXMJyh
z!tLy!N&4G!zOSEdPx&3VR~IPWM52a%EH#^u@;mnCM2<>prkU3|0$K~oP0U!M-+x?!
zXPjszi(M%*iJeaXr{w5g;416>s+B@xHQ8nJbytdjmQOZ&XL4zkP$#3<lQd!R+dG!n
zw45XZT>~C=nBaHyQxB2^d7sux-M7t~qsk|ShA~p(thc{ZU=+boH_YDXR`5VeVCKOp
zgy;!n<AR$neSY$n_E2xo8*?szm+bxM#+Ro`Ux>&Gy1#N;hc8AJ*3^-9Gim5fuiI~m
z81!zIe1HwALGk+3&LuAwe~u=?!Q}PqJ0P)iGif=wh|bmlAHl5DS*gK03Z0JgZCN&X
z<H9265O=w`U`5@3>dJivuI2Qd5hw_mRiY6+Hk=slT5i8N*)+VfG-53|yj0&j`Fh?x
zR_3+Uc;{wGF>LVOU$dSEP5QOaDA`9ee(70(Tdg0n%qJ*uAhb<e9#)Yj;6tv!^m=~i
zoz3LgmWN{waN71~AItBUG%SAnbGzhkfe~+R_{!b`^+?00eiFcn84vCcj~C6ZTT(@s
zG-~Qjiz3N9#p<$)_Y%%WOYF(9-Gf=DLKLv8&@kZ>`V06y<@u5lc^+(t4nwn*()PV?
zp0PMfO*n8BLv5K<R~%?<>db&XJb-=b2=)w5C#Pd?71!2UDW^C_(q3Ro&Q9?yB`aZC
zjYy}?qpTY!)RV>!Y^AKzuRSgFzK`^becIah)ngE!G%E|0=vs>3)_sL#LinyML`f7E
zNcsu|f=vP-e<52QOa)5wY^Ew!t!d)v29jba((X@ajMvoM)DJqB!0|aFl-zf`)a_{D
z$WFJrT@EcZF8EE|iU_UWT#`z!E%jr0PTzj$?3SJJ?De3d$NSw#uhn=reBu#i5&D*B
ztl!si6<BJmaNlF3xT8lpC`SK!>G5dnoCVc#Kq~Df-fFNI{f>#}!PQT)5U39L*uxGc
zcOi1-e7;1y-z)D^JmhlK7pT@3J2cx7boIj8ruqBG)@lfFB40!N-E$sIjz-UW)R7xL
zC3ek*wC}Za{Z7Uc25WBo><g&~v)7aO7&K1Mf)|wUA$q{!CbHqPswF$fyCkRR>|HiZ
zqix-`CE7z}M+7HLbu_UY!?^VmVEj(Sb9bx&>`FV`*mgnmS0UOFIzeKBjwPboPbtcg
z5ED_T?Td2IY70N?u(lkG9%I96ubo5R$SgU(oba>woYS^wIj!H;;3jjGbBY%#{n~p*
z)qsAbsxr5g&oX2f=T`zvo+>`Ln^D&JSunh=-<>)Gz50xTtoKn~|FgbNs6kNYy)@{`
zIsL<vb^qGJ2ho@Pm9vi3@1E{6$Sajy7yI(Ca&zcuyT`XTd{At|?C+<V!#@ARDWsrF
zsb8GtLjrvJ)MMu9=7%vfwF>l#c=-aoM*(YP_3eYb>I9<5yhFrM|8ti?DIevrPQfqn
zR)XP;dteQ|!fp;Rf&109y|$n7U&Vz2UH7lJ`P0(l8%u<>I&netgnAYnJ`kTeCBi;E
z=V`G$*8iP0InQ1}Bai%w^ov-$pLEvGM=ATQ1>RE(ra(OTiDo8cc4Xrex|1d(LZ3c2
zTp-<2icRD&pQgUQMt7XI8+|yo$aYLU#d}MJMn^9`rb=_1uxIU{&n;3{ofVKOn=;ra
z#tk@R>HNebnLtyfXgNx4V7jf{&6!gRPGbMC6a$O~s<IF{%Rc$?!`G%t@7}~FD7)N}
zM{C@dWwE{BuC<meCIhNPeMe`!s`&|b^$iTgRj`%~k$fUx(R5%U6ZVB$Z+V*I`(D7>
zqn+VEdYSUjRbdu6?tC9lRsVL2<HkD$?Q0=%ducQIci>vcCf|m=0suan6PM6II6Xj5
zCq@ICwjV^9$1gv2+^<>ed-^?|5nxxCZ%~tpK{aSvy$ie{<f*%39&xSKO((dx*U>;~
z@P+A%;Ss5ySG)v~I3fBTA4zN!e0k{|UWt|<sd5qSsyJS!iQbABJ>0a*cG!WZ+DSp-
z!4gYbZ&sbLFy_8J2(ylmuxf{&xNYW@rP5%|%r1WO%M@%Yl3KJ@@2@OmoDiss$akVe
zKI4>HLFlyX-E+-?Usq4_VS6T+3^C4wo~Tpz#GQ^i%g3{_)|+25!YWbdBNSw}ZFZwJ
zh}CbLNd-8qEDE^cvm7T$^mW`O4jH+P^U6bXl}fx@*rDE|Am4=9=MC00TZxbVkwGaJ
zmX<Bw#9Y7F{DJSHj)h()Z4gOu?V5}T2)`3_(a-fZoc-X=?wNs`On_1?zxDN{I<09_
zg?|ikCY6Vunn!iW;O2c5@a&i^md;#-L1GGaJD~uyt8;!Pm(fDZ$ae>#aJ!&%TilY4
z$rY987yqbygyJ0Vz8ZR%6u>!D*iaLL5+5pTLG?Td%@D51_3Dcm=kOlE*$%0vrctu$
zAIANa$9AkI|5kHJf{618y^h>T8btN|$cOK2IS&MzCS+ED*f+kYY{UjbeF}@I?%!3H
zeYN5Sw(XLkagRPvCRdLm-{~G)wd(*1`Zq|q3Uod1*I~i6j81OBTqVIvUMLd_b|93M
zA}Mnr<Ia-p?T8AK<iH31Is{q!(m;rqHTtG3ACn{5qxkN&1oA)2xD+J`ntFxiDJflE
z6zWo!FmF%rp<{-eTG}J-opZ6J#=`<;)24M3tOe^g*1sm?Ol6Spll#gpb$E;3J23xA
z$MNY{|GtQe^{ML95z;0!?tuhs=Balj_>D(D7CDJ#j{&Cxf3oF<97)IfX4Arf^pPma
zcc&)(M#+my`B8!fKjYkpYr3wMWKa5*tuU{4$8pgfRm;;C3JIcQubLlfuYLL!gbF+4
znxQJugHvL23l8?5DX7_xVBOpwyOvBZrtER&G!kdZZ;<EcZ{V28Kg}oCmWifi^HpP1
zN@o*-M{DfiO4lF_h_8((7y01OTWlorr&TRWg7+zdguJ^ch}z{@fRNd~>0>g7fWb`E
z`87j>3|<?(1GQzJU1D3qU*mIRKf6#$*rB_HV|En5t8OT4q3ckuDzt0F`D%=;!LOl`
zphRNAPuiqmrH65ty1v`T+C)==AC-nXNe3q1N)2=v&t*2a?tyBs*qZXhExlu{1||_(
z<K&I=++FmvI9|#;=)7_o1_)o0`_15iGYfCHeP5tz<lLq$SUk#mmnTuE3@1}0=2<Op
z>nD7OGpzKmqe3IQ3;jz+=E`R`4=+f_zJtM`;$HT$(9U)0qsgZ|v-NS7$>eZ3=H6%T
zy--ZnWOrLVXZ#BfW|_!A(EAZ>v}L31)vE^$+(3RI17JAmAZ66Rjd+!FPnZlY1bl1X
zgs8o7v&8OspSFF!<(04-s0M4dDis9sZs!&zRXte^^wm7R1%4+3nNT8BaWrS9a=5uk
zW;H;t*R>?7ZZblesr*jzf#3Zs`qHXTF*5WT48Rd-tc*~G<??l_CN-9)%2aSIRU$M4
z&f{}%q4*_kqU$9paJ%}gI&veegqRHHfVr?w5K}tz(=+R{M6TW0pKy~*J|$9OC^s?y
zP?e4gM0Rf2ZCN(Ge~sdye-YMwQ%gOCzDe>%!Wz<ng9trTNkJ%pNVTM?d#sbHZ4_^@
zq%+j2hiahmLY<cc+X6o-%V_u%4`!pH{OLt<9sM(V8SU!}&cgF3s7qqG6IhoS&EE0$
z5Fe}Zna7*n289Ws+oSB%UB3r#w27-I+XSNdgwB%WX4_2+8hlb%#(UeS@T%&Y(KjJN
za&kmy2#UR=tV*^zDvVcdo$Q*Hql9X547|F`x?4SM@p=!{@2{{~Kf@Fkqa{UzzSi%&
zrRO1b#{g^evtcp;MA>5?8iHA31?{KQ7u92$$#W12Z3Cn}*jQfDj;xaP+faign+ij^
z{E{AL9@$10aA(gbf?n&qX=2PLwz8|+-sT~#iWp=+y!>nx(Lkr%RD23*k}VoXurvZh
zi$&I{TU#=r)hgdYB*Y6rR3n6OT<Tr`a6R4Bxo`4kWZl1gdq-#^1!BOUY;*{Hd_>8{
zZFm7zw52bSX;Ohi5@Vo{!13M?>eQgK*W$w(pQF+x!JN1rUs;C>$sIB-t*=Zn3EPes
zcuodqHY>pz-IF{SuOXVbYT-)og#yXwIYRLp1rR0r8y-{Q-mShKY<Nujpi-~ru}12i
zmvU=1*WZAws(d94oYn0D)7Qraslp7RJk@Dt>ZE@LK1k(;Xdr1KIkTcZn@yE#OqFk*
z@ON-GX#&4UNJL}S3FMa@pa}no2!(|&@?uJGGJ=180#N|>9zqNM4Jb|uGb77{ruYM(
zL#BQ6pNJfypZ*EMiNIvw|3k5Y3vieH#iCZFm{N-^C&}HOT%SU{LF?FZ!(%V~-JX@@
zWzTJqlQuWCmk`u#8A*mfl{%DrbP1PrL4D*<rJxU4`PL7sHC}!!c*cc4P0|j4lf6}i
z^<;0cT4f|~Ai}`8VKv+UUy5b++R|tyCC(I<)k_kzLDkl1Yn9)2aLalV$JT>)C712Q
z%0sJ{h4k%mS#=|OhbfMv!F)_D=3RZIUix93q{|no)q#;CbtkO1XW!6xwnHs*U-YtF
zRRILmNkLw|`!lv$9^Yd<%i!=XMAV47@7%-4E|;$dI(*``OEV*L&RG=9+Uaij0h$yI
z4dS<3jY2O1h4xQBEM!o{^|3#GlNM?PitmiRFce%UzGc%{Q(?>B+sQdj1z*Q0u8Aau
z97u>+!XfazUEt6mzj4jr6~6?3Egi@sPMp~r8?J(Vin-Gw4;Ib(JQ7;#+Nk1c?3S$)
z+%#*lIaG0RFaf&Z_6rp2FZLx=zH*(&ji%<qKw$mtujd%Hh#E_-Vj{@DibDRq8T|R$
z;G^gr&|hz<rGU>&5pph+EX3Q8QGp=kX<&(C1`;1Ky*UTJ4q+4sE?N-5O(Ar7QZcBV
z|Hu3oxBK4=piKlJ?|B9Q7b?y1{vaYSQ2uY@eKT$pgt7p>oV!eHP*dJXu;b)Mp?Lpf
z4K(3G>%nao{>3~I2bh<!ct*_BaWNH0#O&LKl985IC=A_{C<%uBiU=9ZBl?Vpt;nMJ
zYe5XEdLJa@?B~v$hW_6*gWu?KES*aW`p4rJ00euROtGdT{@?7w0BDy7by_HQ{&P8q
z2owdWiXC#;N2z7YG!qxY=ipL6x_<|N*fb}Rc4Tqv(B|lB!N!4%L~&cWhiKXSNc^A#
z((afwW_E4zXx48hcXZcpsEb};b#0^ud6?f9YMWPpc#axW|3xuOD!@DKex@0;#_;xp
z@ew=+F*p*UN7Oo=gVN>uYto*TWBpg-Bx<amY@6m^iYc|~m0tm}y9&mI(VSZ>Q)l5N
z?+b6A#!D>#BD{tk1v@I$ih3#Bc)1KOc@B{!b$12anz=vOs<YKo?QX={9tBI);}LU+
zm$)5NtFz}A)S8!V%rySY$eQt<n~!dFlz-xVnq~N;Cw1%Z8PZ9nQjOCR%6);zuLf7p
zBpaoYN7K&M@>(m0!YzwwxmDe?|E)$XA0b+T!R|IUF<Qrx<4?vAN&xMzK@(2xFS9RJ
z<GikLnw_AF0iGwm0UYUaImK<$({s+1IrH|>sK4<pT~s~<0T^Q_0wJv0IinSQ{rS@g
zcCz%u6ypL$uXA*q^*#JZd;2MaRiQiYL8U+JX#y{)Dk~f4J`8lmc#;UJA-{kf9v!aW
zJU+ThNWC5JiLAS#u~p}M?1a1KB%yy5BAf<9@9UBT<+Nj3BruC_wu9%<GKh8+xlqyB
z6wvZ`dbGB@ZTF{rWK`99&Up%|21>A@7?bHtgf=zT1Ar7e8ed(<k2a}#V$?FXPRJoV
zKjS9^^`Ahy3b%~_-Di}XaokB0l)<<;Zr4gQLF<(F-x+{H%Id<!q&SX1RG&r%M8$Y-
z1=;jtjSri0^(Q=XZg=CZKAZ2;sb{#{`VWHS%eBcmb;^*zGcdukcU^GnEMkjeuHt;^
z4;EWJ(AwFgC>WnZ5}O>&uuLJ~CFnx!7pM*Gwi{_?k0tt-Ez_&kiuYr=e|9E3N!U}q
zHf*0OIbLQmO}I^YS_00|Vkoaz)Kh8x%&>iUN1*Q5&82bj(Lox@9kZyS__2qUK<y1l
z(Y5&avjBqn$)`*3*^hBqIq-{xq7EU`JWiqGV0*J!A4iu|OI0TcXYuqeSaGry!tm(2
znbvD+yT)-Gjyy~B14$zI-uwQzJL^N=f7)>1AN2_nRngN5QhYe!lCCYpM-u}Gk|&=C
z;IjpB^W5s6g2we%$5aY~e2}I#KH?;s+(rIw_q+7QmV|5~VhhBSKi#4EeIW^`V#&MW
zhj4$pwTQ#rdk*F{^l8E|bt3!LIO5fw3)FrVqL$4EJ1rjQ%W$-gMOHXUHgZar1+Z3~
zXv0%B{KjadX1Gih-+c5aP~@R=d-Xt`MZ2C%Osq9TjAJV1Q*$Mtc8t}H{7K9hAB%pg
zU63iYlS6jX714a}T#}w7`(eoid@~l(c->G;SWVC@6}3n$Mn%WGp!daELHSyg&AxK^
zw77~Y(~Dn-8j}<_F2lEn?HFh#6jHBqxbCyQb2qB=z1;Tb9-ZJgdG9LG?VC>9Jb>US
zuM*`EZvP%t?D4eF5$DkYO&89>YT*ZRx!yH9-1F@XTM3V9N4cT4Q4CX+d41rJ=YUpt
z1w94A(Qx)%!G}kPkoEe?SgNlO--9116?4d)E*MA?;5IYL)|W6BJboRbq_>)PQ;>7n
zDj<Ox#61-RI;2ko&_1O3RDVUp6JGC`pTJ}9N`~Gn>Tv~=s}MP4XuC+!O@}tBtt7)p
zFQGyWJU7zC_%}TA@t^*12jXpGzAbr^p~+cp#2g&%A`?zcwq9+zvSg|7_|UWN3a@1C
z2v&GS&tX3k$Mdk~QDAd={$02okM&wl-nsE?`3Cv8S$36KTDP_X>9Apu9$(Aap5!o8
zcIA6@CSWMsz5>NmC?V?F=LhvaJM*?X7nXXA7Z%0+Ie*Av-7)+A7ilE{bSPAb*D($Z
z5LYL(C_B~ID){pn<ZUf`#qU*I$oH{R72PwOr!Bm`&zakC(QVxH4ZwR-H}!sudO*q(
zhyL;wxgG~x*3!7J*ShB1Xd@tht57=>RlJ2qWv^d8ZJ!ywP6C?D2zB~VLp;o)c7Z3e
z+szTN-cK!S=8Hn7OxF!2fuIg~hd9pw6rQq5eGGv>-IRcYr(#yJj9_ojw3w#A%ptQm
z9YA3lWRi<Z(3;SCFr>_#wjdW5{<>jLRS8s&e*m>5P90pv@GQSjya2!((eoA-v|Noh
zHlk^1NM3Z8yb727uHl_vmPf63rQ`F>EIK%vf0ZA9Q8ajea6%~Aq|bhZJe$svKA~&~
zu?%f+u+9-jC@#Rx@@J~o9~BFxVwgrXWNVi>A7t(5-FuHpA2Jf1wf{aw1g-n8<PF^w
zp$kAJ_UbmKh37siH=pZpB0ox9Hr$B%kj?^5UaG>)ybLeuLmP9gZL|*GsiJqfON9Qd
zghDl7=%7r94qElDrhL)uR(0DvHwo3s*iGKZ3fNm7ai=SRgN$?{4cC1eZn<BK75Kfb
zA_6C2UF0zVkr_)oG#P9_{3(3{T)i&w_?kpqu<++#GV~8f2RIt){GbgSSD+UXpI^WO
z@;exSravxw8po22i3sKyV*@mO?@H24ziwcn;n>%^=Z$Ize1mu?wPm|xQV;Pyzt=p!
z@jd`RFQ+%K>_tb8h8P@~B%zKFh+O|Q!F@%ruE^Uw;86_2FBb6YGF)g{2jnJ#A9u^C
zg^Tq>(Vmk{u<b8c{%#NNw-Is(%zBNo`0iUTK!lEk*m^u*ZKq$T{_c$oM!<PLckh$2
z4L-T#F9!1?DSFnJ4d=13lfIc~Ag_o4pM*##F}lKcpT#1F4Oq&%zmH_i_nF2MJT;sB
z1YlHOVd{&b7kL=(xH$k3+^5{+R&UBxl{cv+(Cc}cCsdAi*zSf)K)QCCw25|0EK(Y7
zR~Xh4cE#C19M{v@d9Pu$anI<NO$0fC-ED;T$;DYl12ZtLd*u=7cNH_Fo(uur%a{-n
z^zOx&d0$l7_UO5Q_{-Jx&KzbG9fUs&UpO!VTWLVbD848XjzX+M_`HEK+OS456O~-Q
z3-vXf(6rp02F0#u6do2s7jD3v8XVZ2!*+TKqH0^*NokXBSl3srpthDX!r8}5EM70{
zkkf}0%Z-v?p!6Oo5Yo6)Pi*&nwS=lf3y$V|06wnD{O<P;%I7MO>`H`|5;~5Z&rG9+
zUq4&1srK|(s%w@nSp!0QgN(qE;=_Qzm#1M|j7Xm|iI3>a2?cX?-)rs`xN(s{tPtAr
zhj4VI+N6a8(J$7;mxn<~;w#}dR_Iy}!pkjB6F0V2L#52Ss8Y@0DsN26A*2PG(EXDB
zoug^QOQW8AEe}pk7Bd2IF`X|^3w*j2g3&&Q^AoCk@Bcx{OllILLwV!B%}rpssq)Ji
zfeDt2*mDj{zjE36W!?v%3Dj>8hz0Mw!;SAEw0}643UQ$<z~-;y^u{hT_@q!4A_Ku#
zIS{+;&?t7i#1WnW!3_qKEP*j?iH)P6^A!NPDMfIi8stjcHmY|XROx1CV#7-ie=t->
zX=X~I%zR#$8Xcon7hfE~wQgZX+xvfD6V?@_#<*mr6Ep(pBg9C1gX$0H(M9p*r7GXA
z87L(|rE`A`{U;$pg9{*BZ{DznNNA`L>Etd&j3_QrXi*z;jmQAhd!cCBh9cX&5(Ydr
zTA${z8?5I1u44SNNpUfI=slcYvi>v&QD5qvm7#{xzm|ax=2JWcQF#mi&N}AmbYObI
z<}!7BRiBGQ-N3IqcgUpnTR1Lo4FAK)Uje-oa0+Q48&SDinn5Q24Y4r*t{7AbarwUo
zl0c&PNQk4zy7XtrI5|g+U((FJ)Biovh8P!#w2O=WD*$6ZvG?Dj@~JU?$4}^gL3UoK
z{}57yS_DoQpFd*S=|RW9y;5SdhUbk1yb5WOKY=MEy}M|Gw2H?yHZ7WkUY&zpq{Z7e
zOf0g1k_(2AfZnZ%*tstZzdkj{zm~sNS3F%^1$kJPFs89#P?Hza^U&^{mp>3yC0C7k
zeUC?zQT%X2)p@1mPqbeAYEBm%c3*F0t_-VkakChh8V8eiXQpRVz@v4E<F13LgC*yx
z{m~bMM~5TFhwp;KT{mVrHR$qnw~M8g{KJP*vkYT~p)P==7O3+gG3~=~{W0ASr^NEL
z*C<Rc$4Dr{@utDCY23s_kc0rRHZ5`7kJm*~$XpPGWkT}auB=J3|H)}0GlK);N`^d3
zG(I9<@*IfJn-h1NiIst2*};3?!ciSM{Lz-MR>zxqn=3TP<87pkfkLrB)o!azi;l-B
z^HNiF0bZIC3h8q;d`1^Muu6`Qmhg6;g$${+MBlgA(SGl?jgW7zzFTX#6XM+3J<der
z$qR=6l|PX3|FBaB>W2D{y`K%q{rCT+nE$;Y#;2N(2~{4HaASXgPeFRX<fN6roZAn*
F{udCec-#O0

literal 0
HcmV?d00001

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 10fe61e4..dfd5e2c4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(unit)
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index c3edf277..e4879229 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 include(CTest)
diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h
index 8b8e1927..a9bf318c 100644
--- a/test/unit/common/cutlass_unit_test.h
+++ b/test/unit/common/cutlass_unit_test.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp
index 73c2f934..4f024319 100644
--- a/test/unit/common/filter_architecture.cpp
+++ b/test/unit/common/filter_architecture.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt
index 243143f9..798438b6 100644
--- a/test/unit/conv/CMakeLists.txt
+++ b/test/unit/conv/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_custom_target(cutlass_test_unit_conv)
diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt
index 13805670..a9bdbd9a 100644
--- a/test/unit/conv/device/CMakeLists.txt
+++ b/test/unit/conv/device/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2020, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
 list(SORT CUTLASS_NVCC_ARCHS_ENABLED)
@@ -128,8 +134,7 @@ endif()
 # Conv - F16 input, F32 output, F32 accumulation
 cutlass_test_unit_add_executable(
   cutlass_test_unit_conv_device_tensorop_f32_sm70
-  
-  conv2d_fprop_with_broadcast_sm70.cu
+  conv2d_fprop_with_broadcast_sm70.cu 
   conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
   conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
   conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -168,6 +173,10 @@ if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80)
     conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
     conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
     conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+
+    # Conv2d (small channel count specializations)
+    conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+    conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
   
     # Conv2d (Strided Dgrad)
     conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
diff --git a/test/unit/conv/device/cache_testbed_output.h b/test/unit/conv/device/cache_testbed_output.h
index aea7ac9f..299baabe 100644
--- a/test/unit/conv/device/cache_testbed_output.h
+++ b/test/unit/conv/device/cache_testbed_output.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *   * Redistributions of source code must retain the above copyright notice, this list of
- *     conditions and the following disclaimer.
- *   * Redistributions in binary form must reproduce the above copyright notice, this list of
- *     conditions and the following disclaimer in the documentation and/or other materials
- *     provided with the distribution.
- *   * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *     to endorse or promote products derived from this software without specific prior written
- *     permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index 97f8609f..e1c4ddc8 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index 3f225c1d..6736361f 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index 2af98674..b7e9b94b 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index b8847eb8..894cb78f 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index 8dfb2d5c..9973cdd1 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 0693a874..141772e4 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index a808bd28..34c2bfa4 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 156856c5..54bae937 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..6e467b85
--- /dev/null
+++ b/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -0,0 +1,303 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+std::vector<cutlass::conv::Conv2dProblemSize> Conv2dFewChannelProblemSizes(int channels) {
+
+  std::vector<cutlass::conv::Conv2dProblemSize> problems;
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 8, 8, channels},   // input size  (NHWC)
+    {16, 3, 3, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 16, 16, channels},   // input size  (NHWC)
+    {16, 3, 3, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 16, 16, channels},   // input size  (NHWC)
+    {16, 7, 7, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {1, 1},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {32, 7, 7, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {1, 1},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 7, 7, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 5, 5, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {1, 1},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 5, 5, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  return problems;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+#if 0
+TEST(SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 8;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFewChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dFprop>(
+    Conv2dFewChannelProblemSizes(kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 4;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFewChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dFprop>(
+    Conv2dFewChannelProblemSizes(kChannelCount)));
+}
+#endif
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 2;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFewChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dFprop>(
+    Conv2dFewChannelProblemSizes(2 * kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 1;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFewChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dFprop>(
+    Conv2dFewChannelProblemSizes(3 * kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..eedbe050
--- /dev/null
+++ b/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -0,0 +1,240 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+std::vector<cutlass::conv::Conv2dProblemSize> Conv2dFixedChannelProblemSizes(int channels) {
+
+  std::vector<cutlass::conv::Conv2dProblemSize> problems;
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 8, 8, channels},   // input size  (NHWC)
+    {16, 3, 3, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {32, 7, 7, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {1, 1},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 7, 7, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 5, 5, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {1, 1},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  problems.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 224, 224, channels},   // input size  (NHWC)
+    {64, 5, 5, channels},   // filter size (KRSC)
+    {1, 1, 1, 1},                      // padding (pad_h, _, pad_w, _)
+    {2, 2},                            // stride (stride_h, stride_w)
+    {1, 1}                             // dilation (dilation_h, dilation_w)
+  ));
+
+  return problems;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 8;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFixedChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(
+    Conv2dFixedChannelProblemSizes(kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 4;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFixedChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(
+    Conv2dFixedChannelProblemSizes(kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2,
+  128x128_64x3_64x64x64) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::half_t;
+  using ElementB           = cutlass::half_t;
+  using ElementC           = cutlass::half_t;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  int const kChannelCount = 2;
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 64>,
+    cutlass::gemm::GemmShape<64, 64, 64>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kFixedChannels,
+    cutlass::conv::StrideSupport::kStrided,
+    kChannelCount,
+    kChannelCount
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(
+    Conv2dFixedChannelProblemSizes(kChannelCount)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index d2810a09..e56fcde8 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index b6d58ba3..17bd4246 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
index bf76a9dd..a68132a7 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index 579157e7..708ac29a 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
index 78b5ffc2..ff07a045 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -120,5 +126,7 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_ten
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+
 #endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
 
+////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index e2e4ece9..10cf920c 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -76,7 +82,6 @@ TEST(SM70_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tens
   EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
 }
 
-
 ////////////////////////////////////////////////////////////////////////////////
 
 TEST(SM70_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
@@ -118,7 +123,5 @@ TEST(SM70_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_ten
   EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
 }
 
-
-////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 #endif  // CUTLASS_ARCH_MMA_SM70_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index c5710dc0..4080029e 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 5b1c83f5..22aab626 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
index ec11ac28..521d5613 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 34ba4002..68a0a682 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
index ce710393..213a2566 100755
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
index 6373c8c7..67bfc35e 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
index b170b7d3..7393a9f0 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
index 223fc394..97de9950 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -59,7 +65,7 @@ TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor
     cutlass::gemm::GemmShape<8, 8, 32>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
@@ -99,7 +105,7 @@ TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tenso
     cutlass::gemm::GemmShape<8, 8, 32>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
index 5c9e2052..9487a3f1 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -61,7 +67,7 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor
     cutlass::gemm::GemmShape<16, 8, 64>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
@@ -101,7 +107,7 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tenso
     cutlass::gemm::GemmShape<16, 8, 64>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
index c336ecf0..d55ad1bf 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
index ba95d696..117d5096 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
index dfbba477..1187ff75 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -59,7 +65,7 @@ TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor
     cutlass::gemm::GemmShape<8, 8, 16>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
@@ -99,7 +105,7 @@ TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tenso
     cutlass::gemm::GemmShape<8, 8, 16>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
index 7ce105f1..066045b4 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -60,7 +66,7 @@ TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor
     cutlass::gemm::GemmShape<16, 8, 32>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
@@ -100,7 +106,7 @@ TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tenso
     cutlass::gemm::GemmShape<16, 8, 32>,
     cutlass::epilogue::thread::LinearCombination<
       ElementC,
-      64 / cutlass::sizeof_bits<ElementC>::value,
+      128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
       ElementCompute
     >,
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 090c4244..57826212 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
index 511b331c..18398e25 100644
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -52,7 +58,7 @@ template <
  template<typename T> class ActivationOp,
  template<typename T> class BinaryOp,
  template<typename T> class UnaryOp,
- bool TestSplitK = false
+ bool TestSplitK = false 
 >
 void TestResidaulBlock() {
   using ElementA = cutlass::half_t;
diff --git a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
index 2d3505c6..88cc4973 100644
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -32,7 +38,6 @@
 #include "cutlass/epilogue/thread/linear_combination_bias_elementwise.h"
 #include "cutlass/epilogue/thread/linear_combination_residual_block.h"
 #include "cutlass/epilogue/thread/activation.h"
-
 #include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
 #include "cutlass/conv/device/implicit_gemm_convolution.h"
 
diff --git a/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu b/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
index 025eb10c..f096d2db 100644
--- a/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h
index 651d5bed..b740161e 100644
--- a/test/unit/conv/device/conv2d_problems.h
+++ b/test/unit/conv/device/conv2d_problems.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -486,7 +492,6 @@ struct TestbedConv2dProblemSizes {
       {1, 1},             // dilation (dilation_h, dilation_w)
       {4, 1, 1, 328}      // output size (NPQK)
     ));
-    
   }
 
 
diff --git a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 614a3cef..748f83ed 100644
--- a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h
index 1f441c23..125c177a 100644
--- a/test/unit/conv/device/conv2d_testbed.h
+++ b/test/unit/conv/device/conv2d_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -281,6 +287,7 @@ class TestbedConv2d {
     
     EXPECT_TRUE(status == cutlass::Status::kSuccess);
     if (status != cutlass::Status::kSuccess) {
+      std::cerr << "Failed to run." << std::endl;
       return false;
     }
 
@@ -452,33 +459,36 @@ class TestbedConv2d {
 
     EXPECT_TRUE(passed);
 
-    if (!passed) {
-      std::stringstream fname;
-
-      fname << "error_Conv2d_ImplicitGemm_device_"
-        << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
-        << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
-            (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) 
-        << "nhwc_"
+    std::stringstream ss_problem_size_text;
+    ss_problem_size_text         << "nhwc_"
         << problem_size.N << "x"
         << problem_size.H << "x"
         << problem_size.W << "x"
-        << problem_size.C 
+        << problem_size.C
         << "_krsc_"
         << problem_size.K << "x"
         << problem_size.R << "x"
         << problem_size.S << "x"
-        << problem_size.C 
-        << "_padding_" 
+        << problem_size.C
+        << "_padding_"
         << problem_size.pad_h << "x"
-        << problem_size.pad_w 
-        << "_stride_"  
+        << problem_size.pad_w
+        << "_stride_"
         << problem_size.stride_h << "x"
-        << problem_size.stride_w 
+        << problem_size.stride_w
         << "_dilation_"
         << problem_size.dilation_h << "x"
         << problem_size.dilation_w << "_"
-        << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_")
+        << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_");
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_Conv2d_ImplicitGemm_device_"
+        << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_")
+        << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" :
+            (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_"))
+        << ss_problem_size_text.str()
         << Conv2d::ThreadblockShape::kM << "x"  
         << Conv2d::ThreadblockShape::kN << "x"  
         << Conv2d::ThreadblockShape::kK << "_"
@@ -515,6 +525,49 @@ class TestbedConv2d {
 
 };
 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename ImplicitGemm>
+bool TestSpecificConv2d(
+  const Conv2dProblemVector & problem_sizes) {
+
+  bool passed = true;
+
+  //
+  // Testbed object
+  //
+
+  TestbedConv2d<ImplicitGemm> testbed;
+
+  // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0)
+  for(auto conv_problem : problem_sizes) {
+
+    //
+    // Test
+    //
+
+    // test mode = xcross
+    passed = testbed.run(
+      conv_problem,
+      cutlass::conv::SplitKMode::kSerial);
+
+    if (!passed) {
+      return false;
+    }
+
+    // test mode = convolution
+    passed = testbed.run(
+      conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
+      cutlass::conv::SplitKMode::kSerial);
+
+    if (!passed) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
 // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
@@ -589,6 +642,22 @@ bool TestAllConv2d(
       }
     }
 
+    // Fixed channels algorithm requires channel count to match access size
+    if (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kIteratorAlgorithm ==
+        cutlass::conv::IteratorAlgorithm::kFixedChannels) {
+      if (conv_problem.C != ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::AccessType::kElements) {
+        continue;
+      }
+    }
+
+    // Few channels algorithm requires channel count to match access size
+    if (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kIteratorAlgorithm ==
+        cutlass::conv::IteratorAlgorithm::kFewChannels) {
+      if (conv_problem.C % ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::AccessType::kElements) {
+        continue;
+      }
+    }
+
     // CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w} 
     // Although strided dgrad works for all stride combinations, we are only going 
     // to run strided dgrad for non-unity strides 
@@ -615,7 +684,7 @@ bool TestAllConv2d(
     if (!passed) {
       return false;
     }
-    
+
     // test mode = convolution
     passed = testbed.run(
       conv_problem.reset_mode(cutlass::conv::Mode::kConvolution),
@@ -631,8 +700,21 @@ bool TestAllConv2d(
       return true;
     }
   }
-  
 
+  // Small-channels convolution can't run here.
+  if (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kIteratorAlgorithm ==
+        cutlass::conv::IteratorAlgorithm::kFixedChannels) {
+
+    return true;
+  }
+
+  // Small-channels convolution can't run here.
+  if (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kIteratorAlgorithm ==
+        cutlass::conv::IteratorAlgorithm::kFewChannels) {
+
+    return true;
+  }
+  
   // CUTLASS DGRAD's *strided* specialization does not support split-k mode 
   if ((ImplicitGemm::kConvolutionalOperator == 
           cutlass::conv::Operator::kDgrad) && 
@@ -656,7 +738,6 @@ bool TestAllConv2d(
 
     return passed;
   }
-
   // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for 
   // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters 
   // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep 
@@ -719,3 +800,5 @@ bool TestAllConv2d(
 } // namespace device
 } // namespace conv
 } // namespace test
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h
index 86e4278c..db271999 100644
--- a/test/unit/conv/device/conv2d_testbed_interleaved.h
+++ b/test/unit/conv/device/conv2d_testbed_interleaved.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index 27862802..fa51fc04 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index 0b9a6193..0db46a0a 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index 51c4f8d5..4249c6a2 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index 4415a0df..73f0b628 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index 9783f94b..97fda6a7 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 3180ffae..84b0a2ba 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 54d4492e..541304e3 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index e56506e3..49b550f2 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -38,7 +44,8 @@
 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
 
 ////////////////////////////////////////////////////////////////////////////////
-TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32,
+
+TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32,
   128x128_32x3_64x64x32) {
 
   /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
@@ -76,6 +83,61 @@ TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_te
   EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>());
 }
 
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1,
+  128x128_32x3_64x64x32) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA           = cutlass::tfloat32_t;
+  using ElementB           = cutlass::tfloat32_t;
+  using ElementC           = float;
+  using ElementAccumulator = float;
+  using ElementCompute     = float;
+
+  /// Device-level Conv2d instance
+  using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      32 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::StrideSupport::kUnity,
+    1,
+    1    
+  >::Kernel;
+
+  using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dWgradKernel>;
+
+  test::conv::device::Conv2dProblemVector problem_size_list;
+
+  // run specific problem size in the unit test first
+  problem_size_list.push_back(cutlass::conv::Conv2dProblemSize(
+    {1, 8, 8, 1},  // input size (NHWC)
+    {1, 3, 3, 1},      // filter size (KRSC)
+    {1, 1, 1, 1},      // padding (pad_h, _, pad_w, _)
+    {1, 1},            // stride (stride_h, stride_w)
+    {1, 1}             // dilation (dilation_h, dilation_w)
+  ));
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dWgrad>(problem_size_list));
+}
 
 ////////////////////////////////////////////////////////////////////////////////
+
 #endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h
index eee8c162..1561ed8e 100644
--- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h
+++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -116,6 +122,7 @@ class TestbedConv2dWithBroadcast {
   static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator;
   static const bool kAddBroadcastFirst = AddBroadcastFirst;
   static const bool kStoreT = EpilogueOutputOp::kStoreT;
+
 public:
 
   /// Initialization
@@ -340,8 +347,10 @@ class TestbedConv2dWithBroadcast {
     //
     // Reference check
     //
+
     // When kAddBroadcastFirst is true, add bias on the host
     ElementCompute beta_ref = kAddBroadcastFirst ? ElementCompute(0) : beta;
+
 #if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED
 
     cutlass::reference::device::Conv2d<
@@ -398,16 +407,18 @@ class TestbedConv2dWithBroadcast {
   
             ElementZ z;
             ElementT t;
-	    ElementCompute accum = tensor_Y_reference.at({n, p, q, k});
+    
+            ElementCompute accum = tensor_Y_reference.at({n, p, q, k});
 	    ElementCompute bias = ElementCompute(tensor_Broadcast.at({0, 0, 0, k}));
 
+
             if (kAddBroadcastFirst) {
               reference_op(z, t, accum + bias,
                            beta * ElementCompute(tensor_C_reference.at({n, p, q, k})));
             } else {
               reference_op(z, t, accum, bias);
-            }
-
+            }   
+ 
             tensor_Z_reference.at({n, p, q, k}) = z;
             tensor_T_reference.at({n, p, q, k}) = t;
           }
@@ -417,7 +428,9 @@ class TestbedConv2dWithBroadcast {
 
     if (kStoreT) {
       passed = cutlass::reference::host::TensorEquals(
-          tensor_T_computed.host_view(), tensor_T_reference.host_view());
+        tensor_T_computed.host_view(), 
+        tensor_T_reference.host_view());
+
       EXPECT_TRUE(passed);
     }
 
@@ -492,7 +505,8 @@ class TestbedConv2dWithBroadcast {
 template <typename ImplicitGemm,
           typename ReferenceOp = Conv2dWithBroadcastReferenceOp<ImplicitGemm>,
           bool AddBroadcastFirst = false,
-	  bool TestSplitK = true>
+          bool TestSplitK = true 
+>
 bool TestAllConv2dWithBroadcast(
   const Conv2dProblemVector &conv_test_sizes = Conv2dProblemVector(),
   const Conv2dProblemVector &conv_blacklist_sizes = Conv2dProblemVector()) {
diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h
index 57b5d9ee..3b55a9ae 100644
--- a/test/unit/conv/device/conv2d_with_reduction_testbed.h
+++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 8ab2fbb3..ddb79cf5 100644
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index d8620613..786a9479 100644
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
index 4ed1caf8..b0846db0 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 88704f7c..71938929 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 36f1f319..248eeca6 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h
index 9b80f9b9..2051cd4e 100644
--- a/test/unit/conv/device/conv3d_problems.h
+++ b/test/unit/conv/device/conv3d_problems.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h
index 5139c4b7..1c511c19 100644
--- a/test/unit/conv/device/conv3d_testbed.h
+++ b/test/unit/conv/device/conv3d_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
index 9fdb141e..7fd4ba2b 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 39291552..09988ddd 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 6dfd2a07..960f6922 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt
index ee14d228..8a2ab83a 100644
--- a/test/unit/core/CMakeLists.txt
+++ b/test/unit/core/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu
index ffec4d9e..b9f79c35 100644
--- a/test/unit/core/array.cu
+++ b/test/unit/core/array.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu
index 67b5c0ee..f7fec543 100644
--- a/test/unit/core/bfloat16.cu
+++ b/test/unit/core/bfloat16.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu
index e0f043a5..5bfddbe4 100644
--- a/test/unit/core/complex.cu
+++ b/test/unit/core/complex.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu
index 9c2b1a35..8be00593 100644
--- a/test/unit/core/functional.cu
+++ b/test/unit/core/functional.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu
index df68f9d1..66ef3ed7 100644
--- a/test/unit/core/half.cu
+++ b/test/unit/core/half.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/matrix.cu b/test/unit/core/matrix.cu
index b93999cc..5b1b9131 100644
--- a/test/unit/core/matrix.cu
+++ b/test/unit/core/matrix.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu
index 7a81df77..28e124c5 100644
--- a/test/unit/core/matrix_coord.cu
+++ b/test/unit/core/matrix_coord.cu
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 /*! \file
diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu
index 724ec721..a3003e47 100644
--- a/test/unit/core/numeric_conversion.cu
+++ b/test/unit/core/numeric_conversion.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu
index cf05864b..462c8457 100644
--- a/test/unit/core/predicate_vector.cu
+++ b/test/unit/core/predicate_vector.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/quaternion.cu b/test/unit/core/quaternion.cu
index 705a70b3..36106810 100644
--- a/test/unit/core/quaternion.cu
+++ b/test/unit/core/quaternion.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu
index fac90440..95051d88 100644
--- a/test/unit/core/tensor_ref.cu
+++ b/test/unit/core/tensor_ref.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu
index e3c881fa..ca1a7ed6 100644
--- a/test/unit/core/tensor_view.cu
+++ b/test/unit/core/tensor_view.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp
index 619ff85a..b4bc3e8d 100644
--- a/test/unit/core/test_unit_core.cpp
+++ b/test/unit/core/test_unit_core.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu
index df9788dc..072195d1 100644
--- a/test/unit/core/tfloat32.cu
+++ b/test/unit/core/tfloat32.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/data/hashes/cached_results_cutlass_test_unit_conv_device_tensorop_f32_sm80.txt b/test/unit/data/hashes/cached_results_cutlass_test_unit_conv_device_tensorop_f32_sm80.txt
index c4bf06c0..aa56222c 100644
--- a/test/unit/data/hashes/cached_results_cutlass_test_unit_conv_device_tensorop_f32_sm80.txt
+++ b/test/unit/data/hashes/cached_results_cutlass_test_unit_conv_device_tensorop_f32_sm80.txt
@@ -77,6 +77,111 @@ conv2d fprop_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alph
 conv2d fprop_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 835429033 3252222904 934864596 2610068735
 conv2d fprop_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 835429033 3252222904 934864596 4160630405
 conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 83915378 2330495260 722450442 220865236
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 692064357
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 3191762167
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2493803192
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2915688271
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 356690874
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 3600277581
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1480022038
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1482170634
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1684782062
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1312771437
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3927854537
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3753859236
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 331956491
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 3514953364
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 4224357357
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 1962390570
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 3377586111
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 1315372741
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 324191835
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 1085373547
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1916712483
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1584393587
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 1173364286
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 4109782860
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 3316943078
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 1065622134
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 294608043
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 491435109
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 3788164138
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 2104210385
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2640510765
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2748647248
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 169162784
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 3841086721
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 692064357
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 3191762167
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2493803192
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2915688271
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 356690874
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 3600277581
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1480022038
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1482170634
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1684782062
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1312771437
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3927854537
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3753859236
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 331956491
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 3514953364
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 4224357357
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 1962390570
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 3377586111
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 1315372741
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 324191835
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 1085373547
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1916712483
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1584393587
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 1173364286
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 4109782860
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 3316943078
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 1065622134
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 294608043
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 491435109
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 3788164138
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 2104210385
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2640510765
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2748647248
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 169162784
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 3841086721
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
 conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
 conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
 conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 692064357
@@ -116,6 +221,419 @@ conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alph
 conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 169162784
 conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 3841086721
 conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 1152388039 1036082506 2004021933
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 692064357
+conv2d dgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 2411745115 1585402383 3191762167
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2493803192
+conv2d dgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 4021016722 1911021900 2915688271
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 356690874
+conv2d dgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3987447721 311726682 3600277581
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1480022038
+conv2d dgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 1986983586 1898068767 1482170634
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1684782062
+conv2d dgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 3844983048 3628298618 1312771437
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3927854537
+conv2d dgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 741820436 3628298618 3753859236
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 331956491
+conv2d dgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 2188679282 3628298618 3514953364
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 67170595 1307969125 1500995359
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 4224357357
+conv2d dgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 3903005020 1541598000 1962390570
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 3377586111
+conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 764195688 2813830197 1315372741
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 324191835
+conv2d dgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 2683853778 2983167436 1085373547
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1916712483
+conv2d dgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 768805504 3823519277 1584393587
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 1173364286
+conv2d dgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 1195875134 2636428043 4109782860
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 3316943078
+conv2d dgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 785253250 2351024902 1065622134
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 294608043
+conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 417350231 2574042419 491435109
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 3788164138
+conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 2173099720 3438778114 2104210385
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2640510765
+conv2d dgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 2416021807 2145192778 2748647248
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 169162784
+conv2d dgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3252222904 535024815 3841086721
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 2330495260 1682007569 215965178
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4103626856
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4164492975
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 1519678771
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 4056436284
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2359527095
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2240340004
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 3363555017
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 2240937477
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 1587793687
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 4129044530
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 1436246794
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 2860692730
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 3831504826
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 932276705
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 644274899
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 2070383610
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 1391639360
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 2853291206
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 4076043307
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 162466399
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 1760368998
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 370487040
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 2441672462
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 4093130692
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3407045986
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3327506975
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 1997835815
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 2176605786
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 1532225578
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 3175924537
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 3070068838
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 4095170522
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1859011130
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1829382848
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 456267566
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 4065629708
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 2346117136
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 3375703263
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 2680766255
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 819444109
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 2186928678
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 4038147074
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 1491574211
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 664642450
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1691976762
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1352928468
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 4154079864
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 49240552
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 1708014479
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 2976393539
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 141423768
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 1807012226
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 4065587708
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 3621154478
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 3166971951
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 710495809
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 3446932548
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 344178277
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 1633246355
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 778858244
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1090782686
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1523455590
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 3134613422
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 1158845444
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 3239572160
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 2436320376
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 4083335585
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 3302917564
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4103626856
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4164492975
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 1519678771
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 4056436284
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2359527095
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2240340004
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 3363555017
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 2240937477
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 1587793687
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 4129044530
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 1436246794
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 2860692730
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 3831504826
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 932276705
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 644274899
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 2070383610
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 1391639360
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 2853291206
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 4076043307
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 162466399
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 1760368998
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 370487040
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 2441672462
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 4093130692
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3407045986
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3327506975
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 1997835815
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 2176605786
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 1532225578
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 3175924537
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 3070068838
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 4095170522
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1859011130
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1829382848
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 456267566
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 4065629708
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 2346117136
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 3375703263
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 2680766255
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 819444109
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 2186928678
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 4038147074
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 1491574211
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 664642450
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1691976762
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1352928468
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 4154079864
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 49240552
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 1708014479
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 2976393539
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 141423768
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 1807012226
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 4065587708
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 3621154478
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 3166971951
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 710495809
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 3446932548
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 344178277
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 1633246355
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 778858244
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1090782686
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1523455590
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 3134613422
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 1158845444
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 3239572160
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 2436320376
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 4083335585
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 3302917564
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4103626856
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4164492975
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 1519678771
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 4056436284
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2359527095
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2240340004
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 3363555017
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 2240937477
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 1587793687
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 4129044530
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 1436246794
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 2860692730
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 3831504826
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 932276705
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 644274899
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 2070383610
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 1391639360
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 2853291206
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 4076043307
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 162466399
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 1760368998
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 370487040
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 2441672462
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 4093130692
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3407045986
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3327506975
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 1997835815
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 2176605786
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 1532225578
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 3175924537
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 3070068838
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 4095170522
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1859011130
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1829382848
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 456267566
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 4065629708
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 2346117136
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 3375703263
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 2680766255
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 819444109
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 2186928678
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 4038147074
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 1491574211
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 664642450
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1691976762
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1352928468
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 4154079864
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 49240552
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 1708014479
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 2976393539
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 141423768
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 1807012226
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 4065587708
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 3621154478
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 3166971951
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 710495809
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 3446932548
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 344178277
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 1633246355
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 778858244
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1090782686
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1523455590
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 3134613422
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 1158845444
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 3239572160
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 2436320376
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 4083335585
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 3302917564
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 1312614816
+conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 4095449788
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4103626856
+conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4164492975
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 1519678771
+conv2d wgrad_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 2527774459 2695215306 4056436284
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2359527095
+conv2d wgrad_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 390767101 3488761154 1737428004 2240340004
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 3363555017
+conv2d wgrad_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2346452629 2242491745 475190714 2240937477
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 1587793687
+conv2d wgrad_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 695385844 1231166805 3030543492 4129044530
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 1436246794
+conv2d wgrad_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 1231166805 3002932066 2860692730
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 3831504826
+conv2d wgrad_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2078642274 1231166805 2704246666 932276705
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 1585402383 584217576
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 644274899
+conv2d wgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 3952246738 2695215306 2070383610
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 3269716347 1585402383 2091838742
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 1391639360
+conv2d wgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2208297031 3036494114 2853291206
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 4076043307
+conv2d wgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 3638907161 2381193183 162466399
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 1760368998
+conv2d wgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 1748311260 3097125139 370487040
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 2441672462
+conv2d wgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 950177617 3097125139 4093130692
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3407045986
+conv2d wgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 3223936144 3268227320 3327506975
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x15x19x160_15x19_224x1x1_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1371340519 2413111238 4020828414 3782469663
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 1997835815
+conv2d wgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 1849062825 2621782130 2176605786
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 1532225578
+conv2d wgrad_1x16x16x160_17x16_224x2x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2815270590 1028019427 3062073018 3175924537
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 3070068838
+conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2371881721 665408258 4119019906 4095170522
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1859011130
+conv2d wgrad_1x29x37x160_29x37_224x5x5_pad_h2w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2314925659 1329921014 2730633195 1829382848
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 456267566
+conv2d wgrad_1x15x19x40_15x19_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1327011661 654394569 180842421 4065629708
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 2346117136
+conv2d wgrad_1x16x24x72_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2223985070 792561864 3949436648 3375703263
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 2680766255
+conv2d wgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 2182213461 3596813277 819444109
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 4284293314 3730143675 2330242371
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 2186928678
+conv2d wgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 1488124595 3162712336 4038147074
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 1491574211
+conv2d wgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 1324698024 1813717058 664642450
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1691976762
+conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 1617738198 2081693980 1352928468
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 4154079864
+conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 169231440 2081693980 49240552
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 1708014479
+conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 1111691860 1000406280 2976393539
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 141423768
+conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 1807012226
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 4065587708
+conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 3782006390 2081693980 3621154478
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 3166971951
+conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 4018387624 2099843274 710495809
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 3446932548
+conv2d wgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 1007447386 3357056235 344178277
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 1633246355
+conv2d wgrad_32x24x32x32_24x31_32x1x2_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2519843766 4040055257 852881505 778858244
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1090782686
+conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4232110637 428926092 1217669626 1523455590
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 3134613422
+conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3630316758 1787889194 2989357662 1158845444
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 3239572160
+conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 866609547 857847029 3333097025 2436320376
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 4083335585
+conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 3302917564
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 1312614816
+conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 4095449788
 conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
 conv2d wgrad_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 558763846 3282100285 1585402383 820104967
 conv2d wgrad_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 1919973277 4103626856
@@ -195,8 +713,161 @@ conv2d wgrad_1x124x224x96_229x129_24x7x7_pad_h3w3_stride_h1w1_dil_h1w1_conv_alph
 conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 4083335585
 conv2d wgrad_1x233x35x48_233x35_24x7x5_pad_h3w2_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2538496988 3294874067 3443985888 3302917564
 conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
-conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 1312614816
-conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 2411745115 1138813722 4095449788
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2329920945 531242570 422976060 2227466455
+conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1097684943 866495443 918737046 71602759
+conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1097684943 866495443 918737046 2919022357
+conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 1919943812 2697224587 1797794561
+conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 1919943812 2697224587 2255708828
+conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 2529610490 2495190031 3161822626
+conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 2529610490 2495190031 2075196794
+conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 259130347 4030859504 555709962
+conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 259130347 4030859504 2780813035
+conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 259130347 2177645918 3721804249
+conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 259130347 2177645918 3715581640
+conv2d fprop_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 542905940 1152388039 4158385867 734550470
+conv2d fprop_1x1x1x8_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 542905940 1152388039 4158385867 734550470
+conv2d fprop_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1585175937 2411745115 3960856459 2498726848
+conv2d fprop_1x1x8x8_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1585175937 2411745115 3960856459 2808936497
+conv2d fprop_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2793112873 4021016722 3736889909 775549951
+conv2d fprop_1x7x8x8_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2793112873 4021016722 3736889909 999245167
+conv2d fprop_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3892602751 3987447721 1048231784 3578965759
+conv2d fprop_1x7x9x8_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3892602751 3987447721 1048231784 3635477992
+conv2d fprop_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1647798936 1986983586 2046300236 942258126
+conv2d fprop_2x7x9x8_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1647798936 1986983586 2046300236 3024889990
+conv2d fprop_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 3844983048 1063165266 1754355425
+conv2d fprop_3x7x9x8_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 3844983048 1063165266 3524504078
+conv2d fprop_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 741820436 4160104467 1904546220
+conv2d fprop_3x7x9x8_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 741820436 4160104467 3380363267
+conv2d fprop_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 2188679282 2508923337 202634559
+conv2d fprop_3x7x9x8_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 664571366 2188679282 2508923337 129526502
+conv2d fprop_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3376807219 1152388039 3960856459 687707885
+conv2d fprop_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3376807219 1152388039 3960856459 687707885
+conv2d fprop_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3376807219 4021016722 3960856459 551076377
+conv2d fprop_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3376807219 4021016722 3960856459 2736700182
+conv2d fprop_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3708184432 1152388039 3736889909 3070080445
+conv2d fprop_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3708184432 1152388039 3736889909 3070080445
+conv2d fprop_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3793481427 2784674253 206323080 4139449287
+conv2d fprop_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3793481427 2784674253 206323080 37247821
+conv2d fprop_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 724504571 866495443 4160104467 4166513007
+conv2d fprop_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 724504571 866495443 4160104467 4147817279
+conv2d fprop_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3696992523 2499748007 798339031 2425875414
+conv2d fprop_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3696992523 2499748007 798339031 3318636886
+conv2d fprop_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3999848747 2499748007 3897780943 1663875047
+conv2d fprop_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3999848747 2499748007 3897780943 433625993
+conv2d fprop_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1193090006 2131084833 1626339432 2602241257
+conv2d fprop_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1193090006 2131084833 1626339432 2541313003
+conv2d fprop_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 2529610490 3633207144 2483933988
+conv2d fprop_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 961566038 2529610490 3633207144 1649841999
+conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1166358629 4021016722 918737046 2801323914
+conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1166358629 4021016722 918737046 799895052
+conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 1234198914 2697224587 684110079
+conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 1234198914 2697224587 2499076303
+conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 1919943812 2495190031 3897707542
+conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 1919943812 2495190031 4107261292
+conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 3432118660 4030859504 807043958
+conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 3432118660 4030859504 1076112889
+conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 3432118660 2177645918 1505518290
+conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1421336162 3432118660 2177645918 2678526390
+conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 615785832 4009940293 918737046 3181175801
+conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 615785832 4009940293 918737046 1429353926
+conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 2188679282 2697224587 2923215900
+conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 2188679282 2697224587 855984613
+conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1234198914 2495190031 1491449988
+conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1234198914 2495190031 2305792574
+conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1258291265 4030859504 4150734559
+conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1258291265 4030859504 3871852370
+conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1258291265 2177645918 3745529229
+conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4108139691 1258291265 2177645918 846250307
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2132854116
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 3758840577
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 201424948
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 1705939247
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 2657530738
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 3362895769
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 4133050271
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 1868454698
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 4121127243
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 2382857802
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 3397661960
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2208597029
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2274812992
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 1077915308
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 397243715
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 254649230
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 1326012988
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 937037280
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 1858025388
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 981269351
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 1054201594
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 2519894620
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 779628000
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 1483418711
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 4167030585
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 456980924
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 2013696151
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3583381428
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3556307685
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2972648305
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2483086888
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 729442897
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 1567894631
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2132854116
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 3758840577
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 201424948
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 1705939247
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 2657530738
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 3362895769
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 4133050271
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 1868454698
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 4121127243
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 2382857802
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 3397661960
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2208597029
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2274812992
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 1077915308
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 397243715
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 254649230
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 1326012988
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 937037280
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 1858025388
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 981269351
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 1054201594
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 2519894620
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 779628000
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 1483418711
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 4167030585
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 456980924
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 2013696151
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3583381428
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3556307685
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2972648305
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2483086888
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 729442897
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 1567894631
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
 conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
 conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
 conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
@@ -240,10 +911,133 @@ conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_be
 conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
 conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 1189071464 1919973277 257167289
 conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 542905940 1189071464 1919973277 419044577
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2132854116
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 3758840577
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 201424948
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 1705939247
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 2657530738
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 3362895769
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 4133050271
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 1868454698
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 4121127243
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 2382857802
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 3397661960
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2208597029
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2274812992
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 1077915308
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 397243715
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 254649230
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 1326012988
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 937037280
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 1858025388
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 981269351
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 1054201594
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 2519894620
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 779628000
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 1483418711
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 4167030585
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 456980924
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 2013696151
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3583381428
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3556307685
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2972648305
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2483086888
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 729442897
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 1567894631
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2132854116
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 3758840577
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 201424948
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 1705939247
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 2657530738
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 3362895769
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 4133050271
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 1868454698
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 4121127243
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 2382857802
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 3397661960
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2208597029
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2274812992
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 1077915308
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 397243715
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 254649230
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 1326012988
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 937037280
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 1858025388
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 981269351
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 1054201594
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 2519894620
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 779628000
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 1483418711
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 4167030585
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 456980924
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 2013696151
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3583381428
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3556307685
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2972648305
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2483086888
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 729442897
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 1567894631
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
 conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3092573237 1046014840 3129171903 3024531293
 conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3092573237 1046014840 3129171903 3024531293
 conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3092573237 1046014840 1050854832 1177039304
 conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3092573237 1046014840 1050854832 1177039304
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 1152388039 2564029204 3847371704
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2275490218
+conv2d dgrad_1x11x7x8_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2125023038 4021016722 2564029204 2132854116
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x13x11x8_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2793112873 1152388039 1676221042 3084298807
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 3758840577
+conv2d dgrad_1x17x19x8_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 624119942 2784674253 191365871 201424948
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 1705939247
+conv2d dgrad_1x23x5x8_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 118714643 866495443 4038686961 2657530738
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 3362895769
+conv2d dgrad_1x13x17x8_6x8_24x3x3_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3912955093 2499748007 688732356 4133050271
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 1868454698
+conv2d dgrad_1x23x21x8_8x7_24x3x3_pad_h1w1_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2380180646 2499748007 3810393490 4121127243
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 2382857802
+conv2d dgrad_1x20x24x8_8x10_40x3x3_pad_h3w3_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4197955097 2131084833 2598385656 3397661960
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2208597029
+conv2d dgrad_1x19x37x160_10x19_224x3x3_pad_h1w1_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4064585156 3867270480 2925287275 2274812992
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 1077915308
+conv2d dgrad_1x13x16x288_7x8_160x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1663038387 1961598694 1379168514 397243715
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2518156720 3832095556 2917865891 4107392040
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 254649230
+conv2d dgrad_1x71x80x32_36x40_64x5x5_pad_h2w2_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1153452126 808380753 3243383160 1326012988
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 937037280
+conv2d dgrad_1x224x224x8_112x112_64x7x7_pad_h3w3_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3557673531 2529610490 78092922 1858025388
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 981269351
+conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3373735934 4021646304 2515450069 1054201594
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 2519894620
+conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1883441731 4021646304 2499255921 779628000
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 1483418711
+conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2212544074 2244058322 2208482181 4167030585
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 456980924
+conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 2013696151
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3583381428
+conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 841696772 4021646304 3459396734 3556307685
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2972648305
+conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3869576724 3796154296 4084859749 2483086888
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 729442897
+conv2d dgrad_1x32x32x16_6x16_32x3x3_pad_h1w1_stride_h6w2_dil_h1w1_conv_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2106265431 640419443 843165254 1567894631
+conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3092573237 1152388039 1813717058 3398247359
 conv3d wgrad_1x1x3x3x8_13x3_8x1x1_pad_d0h0w0_stride_d1h1w1_dil_d1h1w1_corr_alpha1_beta0 hndhwc_hndhwc_fndhwc_f_f 260402488 3429884614 1585402383 833795101
 conv3d wgrad_1x1x3x3x8_13x3_8x1x1_pad_d0h0w0_stride_d1h1w1_dil_d1h1w1_conv_alpha1_beta0 hndhwc_hndhwc_fndhwc_f_f 260402488 3429884614 1585402383 833795101
 conv3d wgrad_1x1x1x8x8_33x8_8x1x3_pad_d1h1w1_stride_d1h1w1_dil_d1h1w1_corr_alpha1_beta0 hndhwc_hndhwc_fndhwc_f_f 3289763692 3767246007 1919973277 3027845106
diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt
index 9035ad52..f84423fb 100755
--- a/test/unit/epilogue/CMakeLists.txt
+++ b/test/unit/epilogue/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(thread)
diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt
index 3044bbdd..fcb9105b 100644
--- a/test/unit/epilogue/thread/CMakeLists.txt
+++ b/test/unit/epilogue/thread/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/epilogue/thread/activation.cu b/test/unit/epilogue/thread/activation.cu
index cdc1b8f6..041fded4 100644
--- a/test/unit/epilogue/thread/activation.cu
+++ b/test/unit/epilogue/thread/activation.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu
index ae154363..77d7c23c 100644
--- a/test/unit/epilogue/thread/linear_combination.cu
+++ b/test/unit/epilogue/thread/linear_combination.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu
index 1b4552b5..fa611925 100644
--- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu
+++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt
index 2e3d239e..17079354 100755
--- a/test/unit/epilogue/threadblock/CMakeLists.txt
+++ b/test/unit/epilogue/threadblock/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
index fbfda452..9373e7da 100644
--- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
+++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu
index b2e9e477..bca6c247 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
index 751278f9..880d490c 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
index cca188e0..48c8be17 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
index 5f91cbc8..696af7b5 100644
--- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -164,7 +170,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_64x64_32x32x32) {
   EXPECT_TRUE(passed);
 }
 
-TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x128_64x64x32) {
+TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_128x128_64x64x32) {
 
   //
   // Define the warp-level matrix multiply
@@ -173,7 +179,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x128_64x64x32) {
   using ElementOutput = cutlass::int4b_t;
   using ElementAccumulator = int;
   using ElementCompute = float;
-  int const kElementsPerAccess = 32 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kElementsPerAccess = 64 / cutlass::sizeof_bits<ElementOutput>::value;
   int const kPartitionsK = 1;
   
   using Shape = cutlass::gemm::GemmShape<128, 128, 32>;
@@ -291,7 +297,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_64x128_32x64x32) {
   using ElementOutput = cutlass::int4b_t;
   using ElementAccumulator = int;
   using ElementCompute = float;
-  int const kElementsPerAccess = 32 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kElementsPerAccess = 64 / cutlass::sizeof_bits<ElementOutput>::value;
   int const kPartitionsK = 1;
   
   using Shape = cutlass::gemm::GemmShape<64, 128, 32>;
@@ -350,7 +356,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_32x128_32x64x32) {
   using ElementOutput = cutlass::int4b_t;
   using ElementAccumulator = int;
   using ElementCompute = float;
-  int const kElementsPerAccess = 32 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kElementsPerAccess = 64 / cutlass::sizeof_bits<ElementOutput>::value;
   int const kPartitionsK = 1;
   
   using Shape = cutlass::gemm::GemmShape<32, 128, 32>;
@@ -469,7 +475,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_256x128_64x64x32) {
   using ElementOutput = cutlass::int4b_t;
   using ElementAccumulator = int;
   using ElementCompute = float;
-  int const kElementsPerAccess = 32 / cutlass::sizeof_bits<ElementOutput>::value;
+  int const kElementsPerAccess = 64 / cutlass::sizeof_bits<ElementOutput>::value;
   int const kPartitionsK = 1;
   
   using Shape = cutlass::gemm::GemmShape<256, 128, 32>;
diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
index c0fa2b9e..828e3941 100644
--- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
index 5433b16f..b6490154 100644
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h b/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
index c67ecbf9..903248ed 100644
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
index 59121617..4606b845 100644
--- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
+++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu
index 450f683a..7d434eaa 100644
--- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu
+++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
index 6931422f..30443064 100644
--- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
+++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h
index 4d885997..bf40b9ec 100644
--- a/test/unit/epilogue/threadblock/testbed.h
+++ b/test/unit/epilogue/threadblock/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h
index 4f293125..f7c01b4c 100644
--- a/test/unit/epilogue/threadblock/testbed_planar_complex.h
+++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt
index 704750ab..d9fb9576 100644
--- a/test/unit/epilogue/warp/CMakeLists.txt
+++ b/test/unit/epilogue/warp/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
index 27ae3ced..90f9d975 100644
--- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
index 71317e3e..9a82ccda 100644
--- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
index 5d656671..dad62b52 100644
--- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt
index 06331287..b0cee068 100644
--- a/test/unit/gemm/CMakeLists.txt
+++ b/test/unit/gemm/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(thread)
diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt
index 51f28f39..2aca5c16 100644
--- a/test/unit/gemm/device/CMakeLists.txt
+++ b/test/unit/gemm/device/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_custom_target(
@@ -130,6 +136,7 @@ cutlass_test_unit_add_executable(
   BATCH_SOURCES ON
   BATCH_SIZE 4
 
+  gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
   gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
 
   gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
@@ -156,7 +163,9 @@ cutlass_test_unit_add_executable(
   gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
   gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
   gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
-  
+  gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu 
+  gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
+
   gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
   gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
 
@@ -244,6 +253,8 @@ cutlass_test_unit_add_executable(
   gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
   gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
   gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
+  gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu 
+  gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
   gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
   gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
 
@@ -383,3 +394,150 @@ cutlass_test_unit_add_executable(
 
 endif()
 
+if (NOT CUDA_COMPILER MATCHES "[Cc]lang")
+
+add_dependencies(
+  cutlass_test_unit_gemm_device
+  cutlass_test_unit_gemm_device_blas3
+  )
+
+add_dependencies(
+  test_unit_gemm_device
+  test_unit_gemm_device_blas3
+  )
+
+cutlass_test_unit_add_executable(
+  cutlass_test_unit_gemm_device_blas3
+
+  BATCH_SOURCES ON
+  BATCH_SIZE 4
+
+  ## SYRK 
+  # Syrk SM80 f64 tests
+  syrk_f64n_f64t_tensor_op_f64_sm80.cu 
+  syrk_f64t_f64n_tensor_op_f64_sm80.cu 
+
+  # Syrk SM80 f32 tests
+  syrk_tf32n_f32t_tensor_op_f32_sm80.cu
+  syrk_tf32t_f32t_tensor_op_f32_sm80.cu
+  syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
+  syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
+
+  # Syrk SM80 complex f64 tests
+  syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
+  syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
+  syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+
+  # Syrk SM80 complex f32 tests
+  syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
+  syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
+  syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+  syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
+
+  ## HERK 
+  # Herk SM80 complex f64 tests
+  herk_cf64h_cf64n_tensor_op_f64_sm80.cu
+
+  # Herk SM80 complex f32 tests
+  herk_cf32h_cf32n_tensor_op_f32_sm80.cu
+  herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
+
+  ## TRMM
+  # Trmm SM80 f64 tests
+  trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
+  trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
+  trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
+  trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
+  trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
+
+  # Trmm SM80 f32 tests
+  trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
+  trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
+  trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
+  trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
+  trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
+  trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
+  trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
+  trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
+
+  # Trmm SM80 complex f64 tests
+  trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
+  trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+
+  # Trmm SM80 complex f32 tests
+  trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
+  trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+
+  ## SYR2K
+  # Syr2k SM80 f64 tests
+  syr2k_f64n_f64t_tensor_op_f64_sm80.cu
+  syr2k_f64n_f64n_tensor_op_f64_sm80.cu
+  syr2k_f64t_f64n_tensor_op_f64_sm80.cu
+
+  # Syr2k SM80 f32 tests
+  syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
+  syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
+  syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
+  syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
+
+  # Syr2k SM80 complex f64 tests
+  syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
+  syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
+
+  # Syr2k SM80 complex f32 tests
+  syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
+  syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
+  syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
+  syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+
+  ## HER2K
+  # Her2k SM80 complex f64 tests
+  her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
+
+  # Her2k SM80 complex f32 tests
+  her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
+  her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
+
+  ## SYMM
+  # Symm SM80 f64 tests
+  symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
+  symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
+  symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
+  symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
+  symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
+  symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
+  symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
+  symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
+
+  # Symm SM80 f32 tests
+  symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
+  symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
+  symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
+  symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
+  symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
+  symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
+
+  # Symm SM80 complex f64 tests
+  symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
+  symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
+  symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
+
+  # Symm SM80 complex f32 tests
+  symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
+  symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
+  symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
+  symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
+
+  # Hemm SM80 complex f64 tests
+  hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
+  hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
+  hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
+  
+  # Hemm SM80 complex f32 tests
+  hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
+  hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
+  hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
+  hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
+  )
+
+endif()
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
index 74b5fb12..f960dfcc 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -112,6 +118,50 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x128x512_64x64x512) {
   EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
 }
 
+TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x256x512_64x64x512) {
+
+  using ElementOutput = int32_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = int32_t;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<64, 256, 512>,
+      cutlass::gemm::GemmShape<64, 64, 512>,
+      cutlass::gemm::GemmShape<8, 8, 128>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128,
+      false, cutlass::arch::OpXorPopc>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 256x64x512_64x64x512) {
+
+  using ElementOutput = int32_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = int32_t;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<256, 64, 512>,
+      cutlass::gemm::GemmShape<64, 64, 512>,
+      cutlass::gemm::GemmShape<8, 8, 128>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128,
+      false, cutlass::arch::OpXorPopc>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
 TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x128x512_32x64x512) {
 
   using ElementOutput = int32_t;
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
index 107f3982..59e1e367 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
index f4f10ad6..e6368203 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
index e9afd20b..c1a32d9d 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -112,6 +118,49 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x128x512_64x64x512) {
   EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
 }
 
+TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x256x512_64x64x512) {
+
+  using ElementOutput = int32_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = int32_t;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<64, 256, 512>,
+      cutlass::gemm::GemmShape<64, 64, 512>,
+      cutlass::gemm::GemmShape<8, 8, 128>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128,
+      false, cutlass::arch::OpXorPopc>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 256x64x512_64x64x512) {
+
+  using ElementOutput = int32_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = int32_t;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<256, 64, 512>,
+      cutlass::gemm::GemmShape<64, 64, 512>,
+      cutlass::gemm::GemmShape<8, 8, 128>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128,
+      false, cutlass::arch::OpXorPopc>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
 TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x128x512_32x64x512) {
 
   using ElementOutput = int32_t;
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
index 1180646f..011399a9 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
index 7eb860eb..f359db81 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
index 1d3261c1..d46f9c55 100644
--- a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
index 567d27eb..33a4afd2 100644
--- a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
index 72ea5aff..722d0d98 100644
--- a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
index 9d2022d7..1b263a4c 100644
--- a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
index 68eb218c..65b68e1a 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
index b325ea16..b3eb536a 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
index 14e8ff51..7d5b3906 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
index 0958f304..d65b7a7e 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
index 98ec1baf..f0072ce7 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
index 19fb324e..56b11390 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
index b077e7b8..95fe7e54 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
index 1a9e54b1..809be9f6 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
index 0640a08b..a54d510e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
index b70af0a8..2f0674e6 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
index 9345b2c3..0061b140 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
index cd196932..453f71a4 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
index 6d018ed6..6ccc674e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
index 8322f8d0..5b2aecb5 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
index 8c41e724..e18e647e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
index 5f5bf9f0..b52e2009 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
index da8aaba5..5c484613 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
index fcc5caa6..de2e6415 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
index bebfa61e..2ef69f31 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
index 5a750cc5..e17df06f 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
index 784da3c0..b2625df8 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
index 4f601696..d86572ea 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
index 7960dc73..9ab17331 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
index dc310d77..c8658c65 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
index dce090c8..dfbe427e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
index e9f4dd00..aa7f2de8 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
index a7a4aee6..14cf7977 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
index 89440578..f8ac08b4 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
index e7f56a81..f6408e6e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
index 25e0b046..e50e2033 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
index dc46a205..20d3f0fb 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
index 98b6f43f..a67ee9f3 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
index 643ad31c..4e298d28 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
index 3e7e2eb3..2c5dc566 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
index c8a46cfe..ed40a285 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
index 71d8f586..4b76da00 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
index 746067b7..e76965e6 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
index 5f48fde0..a66b460d 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
index ec05b405..49cba753 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
index 06f98cd2..e6403449 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
index 0d0dbbce..ca914e10 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
index 1ab1f601..a9e73b50 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
index 02bfec00..3038c236 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
index e6419707..078247d3 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
index 30057f65..2486221b 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
index 5d9a253a..6532c378 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
index bf42eb08..02ccb7a6 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
index 8bfc22be..4be7d5bf 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
index 2d3619ce..32403b69 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
index 3db56003..c30f8b41 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
index 56d72161..39d66c22 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
index 65fda639..0fea4086 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
index f1b7b67f..efdfd0d7 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
index dad3949f..bd06a66f 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
index 46a89505..e5feffd1 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
index 3c2010ea..9278e32f 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
index 3dece4a8..13effa80 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
index 0ba9f58c..4f148baa 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
index 6ede7fed..0d2c8847 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
index 4fcefb83..ce33b668 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
index 3abeae37..9410d89e 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
index 28a63ec4..54278536 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
index 9a4e7752..dded0e81 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
index 1f4c4606..97add72d 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
index 701e9ec6..66a573de 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
index 43936067..8bf17302 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
index a8975a4a..7428f94b 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
index ba4ed527..830f6baf 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
index 46f862b1..7f424946 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
index 0242eb11..7c3fa570 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
index 3858562d..18e8bc32 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
index e3d6d3d0..707025c9 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
index 1fe32cb0..424d1bac 100644
--- a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
index f29d4c99..5ccd061f 100644
--- a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
index f76fddbf..cf184ec6 100644
--- a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
index e2e24f47..683cf43c 100644
--- a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
index c187804b..eb6f1984 100644
--- a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_grouped_sm80.cu b/test/unit/gemm/device/gemm_grouped_sm80.cu
index 609732c4..921161e4 100644
--- a/test/unit/gemm/device/gemm_grouped_sm80.cu
+++ b/test/unit/gemm/device/gemm_grouped_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -339,6 +345,47 @@ TEST(SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32, 128x128x32_64x64x32)
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+TEST(SM80_Device_GemmGrouped_f16n_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    cutlass::ComplexTransform::kNone,
+    8,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    cutlass::ComplexTransform::kNone,
+    8,
+    ElementOutput, cutlass::layout::RowMajor,    // row major
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(24);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 TEST(SM80_Device_GemmGrouped_f16t_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) {
 
   using ElementOutput = cutlass::half_t;
@@ -353,7 +400,7 @@ TEST(SM80_Device_GemmGrouped_f16t_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) {
     cutlass::layout::ColumnMajor, 
     cutlass::ComplexTransform::kNone,
     8,
-    ElementOutput, cutlass::layout::RowMajor,
+    ElementOutput, cutlass::layout::ColumnMajor,
     ElementAccumulator, 
     cutlass::arch::OpClassTensorOp, 
     cutlass::arch::Sm80,
@@ -380,6 +427,47 @@ TEST(SM80_Device_GemmGrouped_f16t_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+TEST(SM80_Device_GemmGrouped_f16t_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) {
+
+  using ElementOutput = cutlass::half_t;
+  using ElementAccumulator = float;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    cutlass::half_t,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    8,
+    cutlass::half_t,
+    cutlass::layout::ColumnMajor,
+    cutlass::ComplexTransform::kNone,
+    8,
+    ElementOutput, cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<64, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 16>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    4>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(27);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 TEST(SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64, 64x64x16_32x32x16) {
 
   using ElementInput = double;
@@ -464,6 +552,132 @@ TEST(SM80_Device_GemmGrouped_f32t_f32t_f32n_simt_f32, 128x128x8_64x32x1) {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+TEST(SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32, 128x128x8_64x32x1) {
+
+  using ElementInput = float;
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementOutput, cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 1,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(27);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_GemmGrouped_f32t_f32t_f32n_simt_f32, 128x64x8_64x32x1) {
+
+  using ElementInput = float;
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementOutput, cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 1,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(27);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32, 128x64x8_64x32x1) {
+
+  using ElementInput = float;
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementInput,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    1,
+    ElementOutput, cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 1,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(27);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 TEST(SM80_Device_GemmGrouped_cf32n_cf32n_cf32n_tensorop_f32, 64x64x16_32x32x16) {
 
   using ElementInput = cutlass::complex<float>;
@@ -507,7 +721,7 @@ TEST(SM80_Device_GemmGrouped_cf32n_cf32n_cf32n_tensorop_f32, 64x64x16_32x32x16)
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-TEST(SM80_Device_GemmGrouped_cf32c_cf32t_cf32c_tensorop_f32, 64x64x16_32x32x16) {
+TEST(SM80_Device_GemmGrouped_cf32c_cf32t_cf32n_tensorop_f32, 64x64x16_32x32x16) {
 
   using ElementInput = cutlass::complex<float>;
   using ElementOutput = cutlass::complex<float>;
@@ -550,6 +764,49 @@ TEST(SM80_Device_GemmGrouped_cf32c_cf32t_cf32c_tensorop_f32, 64x64x16_32x32x16)
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+TEST(SM80_Device_GemmGrouped_cf32c_cf32t_cf32t_tensorop_f32, 64x64x16_32x32x16) {
+
+  using ElementInput = cutlass::complex<float>;
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ElementInput,
+    cutlass::layout::ColumnMajor,
+    cutlass::ComplexTransform::kConjugate,
+    1,
+    ElementInput,
+    cutlass::layout::ColumnMajor,
+    cutlass::ComplexTransform::kConjugate,
+    1,
+    ElementOutput, cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+        ElementOutput, 1,
+        ElementAccumulator, ElementAccumulator>,
+    cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
+    3,
+    cutlass::arch::OpMultiplyAddComplex>::GemmKernel;
+
+  using Gemm = cutlass::gemm::device::GemmGrouped<GemmKernel>;
+
+  //
+  // Test
+  //
+
+  test::gemm::device::TestbedGrouped<Gemm> testbed;
+
+  bool passed = testbed.run(27);
+  EXPECT_TRUE(passed);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 TEST(SM80_Device_GemmGrouped_cf32t_cf32h_cf32n_tensorop_f32, 64x64x16_16x16x16) {
 
   using ElementInput = cutlass::complex<double>;
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
index 85bcc671..b2392cdf 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
index 22dcc43b..93b0569f 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
index 41977eac..c6ea1792 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
index ff070dae..05942198 100644
--- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
index 30cb5d56..e32c513d 100644
--- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
index 56d1ffc1..71f0f790 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
index 644ca8e7..48aad314 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
index c6afebc1..d99f6f31 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
index bf084b39..df9674b3 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
index b50c2b80..96f68eb5 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
index ff40a868..890cbe39 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
index 65659283..d391a0bd 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
index 2e6df950..9f0b16a1 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -142,6 +148,70 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x128x128_64x64x128) {
   EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
 }
 
+TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x256x128_64x64x128) {
+
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::int4b_t,
+    cutlass::layout::RowMajor,
+    cutlass::int4b_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<64, 256, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x64x128_64x64x128) {
+
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::int4b_t,
+    cutlass::layout::RowMajor,
+    cutlass::int4b_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<256, 64, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      ElementOutput,
+      32 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
 TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x128x128_32x64x128) {
 
   using ElementOutput = cutlass::int4b_t;
@@ -239,5 +309,4 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x64x128_32x32x128) {
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
 #endif
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
new file mode 100644
index 00000000..29c201a4
--- /dev/null
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
@@ -0,0 +1,374 @@
+/**************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "multistage_testbed.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x256x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 256, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x128x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 128, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x128x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x64x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 64, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x256x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 256, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x128x256_32x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 128, 256>,
+      cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x64x256_64x32x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 64, 256>,
+      cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x64x256_32x32x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 64, 256>,
+      cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x256x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 256, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x128x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 128, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x128x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x64x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 64, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x256x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 256, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x128x128_32x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 128, 128>,
+      cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x64x128_64x32x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 64, 128>,
+      cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x64x128_32x32x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 64, 128>,
+      cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+////////////////////////////////////////////////////////////////////////////////
+#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
index 3604ebe5..1a405b7a 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -71,7 +77,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x256x128_64x64x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
@@ -103,7 +109,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x128x128_64x64x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
@@ -135,13 +141,76 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x128x128_64x64x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x256x128_64x64x128) {
+
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::int4b_t,
+    cutlass::layout::RowMajor,
+    cutlass::int4b_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<64, 256, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      ElementOutput,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
   EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
 }
 
+TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x64x128_64x64x128) {
+
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    cutlass::int4b_t,
+    cutlass::layout::RowMajor,
+    cutlass::int4b_t,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<256, 64, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      ElementOutput,
+      32 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
 TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x128x128_32x64x128) {
 
   using ElementOutput = cutlass::int4b_t;
@@ -167,7 +236,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x128x128_32x64x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
@@ -199,7 +268,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x64x128_64x32x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
@@ -231,7 +300,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x64x128_32x32x128) {
       ElementAccumulator,
       ElementCompute
     >,
-    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     2
   >;
 
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
new file mode 100644
index 00000000..da4549e6
--- /dev/null
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
@@ -0,0 +1,374 @@
+/**************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "multistage_testbed.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x256x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 256, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x128x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 128, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x128x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x64x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 64, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x256x256_64x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 256, 256>,
+      cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x128x256_32x64x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 128, 256>,
+      cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x64x256_64x32x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 64, 256>,
+      cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x64x256_32x32x256, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 64, 256>,
+      cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x256x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 256, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x128x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 128, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x128x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t,
+      cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 256x64x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 64, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x256x128_64x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 256, 128>,
+      cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x128x128_32x64x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 128, 128>,
+      cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 128x64x128_64x32x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 64, 128>,
+      cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+CUTLASS_TEST_L0(SM80_Device_Gemm_s4t_s4n_s4t_tensor_op_s32, 64x64x128_32x32x128, {
+  using ElementOutput = cutlass::int4b_t;
+  using ElementAccumulator = int32_t;
+  using ElementCompute = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator,
+      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<64, 64, 128>,
+      cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>,
+      cutlass::epilogue::thread::LinearCombinationClamp<
+          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value, ElementAccumulator, ElementCompute>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>;
+
+  test::gemm::device::MultistageTestbed<Gemm> testbed;
+
+  EXPECT_TRUE(testbed.run_all());
+} )
+
+////////////////////////////////////////////////////////////////////////////////
+#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
index 023e64db..7f0a6b10 100644
--- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
index 9df00e23..e33b2ed1 100644
--- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
index 622b0824..098f8619 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
index 516d9e5f..ad8e9ff9 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
index 6a061eed..6f7bfe65 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
index 7f80cafc..e5279f35 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
index 30749363..f1577be3 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
index 69121440..1f63393b 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
index bb4ae6c3..9a49fe96 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
index 585ac0c3..c95e3f95 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -142,7 +148,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x64_64x32x64,
     cutlass::gemm::GemmShape<8, 8, 16>,
     cutlass::epilogue::thread::LinearCombinationClamp<
       ElementOutput,
-      32 / cutlass::sizeof_bits<ElementOutput>::value,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
       ElementCompute
     >,
@@ -175,7 +181,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x64_32x32x64, {
     cutlass::gemm::GemmShape<8, 8, 16>,
     cutlass::epilogue::thread::LinearCombinationClamp<
       ElementOutput,
-      32 / cutlass::sizeof_bits<ElementOutput>::value,
+      64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
       ElementCompute
     >,
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
index 7521c311..438eaf43 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
index 4928d2ad..95fa210d 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
index 29506531..fdd40d63 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -131,7 +137,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x64_64x32x64,
       cutlass::gemm::GemmShape<128, 64, 64>,
       cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<8, 8, 16>,
       cutlass::epilogue::thread::LinearCombinationClamp<
-          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value,
           ElementAccumulator, ElementCompute>,
       cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
 
@@ -152,7 +158,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x64_32x32x64, {
       cutlass::gemm::GemmShape<64, 64, 64>,
       cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<8, 8, 16>,
       cutlass::epilogue::thread::LinearCombinationClamp<
-          ElementOutput, 32 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value,
           ElementAccumulator, ElementCompute>,
       cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>;
 
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
index d46a485c..b886d6c2 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
@@ -1,24 +1,30 @@
 /**************************************************************************************************
- Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification, are permitted
- provided that the following conditions are met:
-     * Redistributions of source code must retain the above copyright notice, this list of
-       conditions and the following disclaimer.
-     * Redistributions in binary form must reproduce the above copyright notice, this list of
-       conditions and the following disclaimer in the documentation and/or other materials
-       provided with the distribution.
-     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-       to endorse or promote products derived from this software without specific prior written
-       permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
index 3cc32d92..76979c18 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
index ec3444e5..cc46e547 100644
--- a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -86,7 +92,7 @@ TEST(SM75_Device_GemmSplitKSerial_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64
     cutlass::gemm::GemmShape<128, 256, 32>,
     cutlass::gemm::GemmShape<64, 64, 32>,
     cutlass::gemm::GemmShape<16, 8, 8>,
-    cutlass::epilogue::thread::LinearCombination<
+    cutlass::epilogue::thread::LinearCombinationRelu<
       ElementOutput,
       128 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
@@ -99,7 +105,8 @@ TEST(SM75_Device_GemmSplitKSerial_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64
     kSplitKSerial
   >;
 
-  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+  bool result = test::gemm::device::TestAllGemm<Gemm, true>();
+  EXPECT_TRUE(result);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
index 6d5820f7..01b29b2c 100644
--- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
+++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
index ce13d8db..284532ad 100644
--- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
+++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
index 136f3b33..2325c92b 100644
--- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
index 250c37e8..5a4644a6 100644
--- a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
index 1bbecd04..e5948c12 100644
--- a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
index b0d2fec8..3cab51d0 100644
--- a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
index 5bac31ab..bd8487be 100644
--- a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
index aa305ca7..1a98ef0b 100644
--- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
index a952e6d6..a3143943 100644
--- a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
index fc13d647..6323d192 100644
--- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
index 36aeb411..87e7b535 100644
--- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
new file mode 100644
index 00000000..98cb75fd
--- /dev/null
+++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
@@ -0,0 +1,117 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+    
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/gemm_universal.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_universal.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::GemmUniversal<
+      cutlass::half_t, 
+      cutlass::layout::ColumnMajor, 
+      cutlass::half_t,
+      cutlass::layout::RowMajor, 
+      ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<64, 64, 32>,
+      cutlass::gemm::GemmShape<32, 32, 32>, 
+      cutlass::gemm::GemmShape<16, 8, 8>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementAccumulator>,
+      cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, 
+      2>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal<Gemm>());
+}
+
+
+TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_updated_batch_count) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::GemmUniversal<
+      cutlass::half_t, 
+      cutlass::layout::ColumnMajor, 
+      cutlass::half_t,
+      cutlass::layout::RowMajor, 
+      ElementOutput, cutlass::layout::ColumnMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+      cutlass::gemm::GemmShape<64, 64, 32>,
+      cutlass::gemm::GemmShape<32, 32, 32>, 
+      cutlass::gemm::GemmShape<16, 8, 8>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementAccumulator>,
+      cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, 
+      2,
+      1,
+      1>;
+
+  EXPECT_TRUE(test::gemm::device::TestGemmUniversal<Gemm>(
+    {128, 128, 2}, 
+    cutlass::gemm::GemmUniversalMode::kGemm, 
+    15));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
index ca44361b..e3b50da1 100644
--- a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -49,7 +55,7 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) {
+TEST(SM75_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) {
 
   using ElementOutput = float;
   using ElementAccumulator = float;
@@ -59,7 +65,7 @@ TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32)
       cutlass::layout::ColumnMajor, 
       cutlass::half_t,
       cutlass::layout::RowMajor, 
-      ElementOutput, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::RowMajor,
       ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
       cutlass::gemm::GemmShape<64, 64, 32>,
       cutlass::gemm::GemmShape<32, 32, 32>, 
@@ -74,7 +80,7 @@ TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32)
 }
 
 
-TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_updated_batch_count) {
+TEST(SM75_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32_updated_batch_count) {
 
   using ElementOutput = float;
   using ElementAccumulator = float;
@@ -84,7 +90,7 @@ TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_u
       cutlass::layout::ColumnMajor, 
       cutlass::half_t,
       cutlass::layout::RowMajor, 
-      ElementOutput, cutlass::layout::ColumnMajor,
+      ElementOutput, cutlass::layout::RowMajor,
       ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
       cutlass::gemm::GemmShape<64, 64, 32>,
       cutlass::gemm::GemmShape<32, 32, 32>, 
diff --git a/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu b/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
index a9c084f1..7af1a26d 100644
--- a/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu b/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
index df75946e..6e1e0d15 100644
--- a/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu b/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
index c4e1fcc3..fa1a9c3b 100644
--- a/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/gemv.cu b/test/unit/gemm/device/gemv.cu
index bdfc6c33..4a25b7e9 100644
--- a/test/unit/gemm/device/gemv.cu
+++ b/test/unit/gemm/device/gemv.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..aecefe5e
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
new file mode 100644
index 00000000..b7da888b
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..e7423a55
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_ls_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
new file mode 100644
index 00000000..2933c375
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf32h_cf32n_rs_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
new file mode 100644
index 00000000..a1568d82
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_l_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_u_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_u_tensor_op_f64_gaussian, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
new file mode 100644
index 00000000..ca9bdbc5
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_ls_u_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
new file mode 100644
index 00000000..58570150
--- /dev/null
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HEMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Hemm_cf64h_cf64n_rs_u_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Hemm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Hemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..966c7519
--- /dev/null
+++ b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HER2K interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf32n_cf32n_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf32h_cf32n_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..3699c4ab
--- /dev/null
+++ b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HER2K interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf32n_cf32n_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf32h_cf32n_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..eea6f667
--- /dev/null
+++ b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,149 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HER2K interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf64h_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..5cb6309f
--- /dev/null
+++ b/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,201 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HER2K interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#if 0 // HER2K with RowMajor output is not supported 
+TEST(SM80_Device_Her2k_cf64n_cf64t_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    false, // IsBetaZero
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf64c_cf64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    false, // IsBetaZero
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Her2k_cf64h_cf64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    false, // IsBetaZero
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KHermitianUniversal<Rank2K>());
+}
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..ca2a72e5
--- /dev/null
+++ b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,219 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HERK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_N (column-major) input layouts 
+TEST(SM80_Device_Herk_cf32n_cf32n_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// HERK operator on CUBLAS_OP_N (column-major) input layouts 
+TEST(SM80_Device_Herk_cf32n_cf32n_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_C (row-major + conj) input layouts
+TEST(SM80_Device_Herk_cf32h_cf32n_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_C (row-major + conj) input layouts
+TEST(SM80_Device_Herk_cf32h_cf32n_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..dbbd23f5
--- /dev/null
+++ b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,219 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HERK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_N (column-major) input layouts 
+TEST(SM80_Device_Herk_cf32n_cf32n_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// HERK operator on CUBLAS_OP_N (column-major) input layouts 
+TEST(SM80_Device_Herk_cf32n_cf32n_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_C (row-major + conj) input layouts
+TEST(SM80_Device_Herk_cf32h_cf32n_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_C (row-major + conj) input layouts
+TEST(SM80_Device_Herk_cf32h_cf32n_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..1db9f930
--- /dev/null
+++ b/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,175 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide HERK interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_N (column-major) input layouts 
+TEST(SM80_Device_Herk_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_N (column-major) input layouts
+TEST(SM80_Device_Herk_cf64n_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// HERK operator on CUBLAS_OP_C (row-major + conj) input layouts
+TEST(SM80_Device_Herk_cf64h_cf64n_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::RowMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::BlasMode::kHermitian
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h
index b2cace85..dbc77b9d 100644
--- a/test/unit/gemm/device/multistage_testbed.h
+++ b/test/unit/gemm/device/multistage_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h
index febbaba2..7b1ece85 100644
--- a/test/unit/gemm/device/multistage_testbed_interleaved.h
+++ b/test/unit/gemm/device/multistage_testbed_interleaved.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
index 90f596ea..6f68ebf3 100644
--- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
index 94d31205..de145b37 100644
--- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
index 6769763d..7f338db5 100644
--- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
index f93b8ec5..a734d87b 100644
--- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
index 606324da..1af40ecb 100644
--- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
index 09c52505..6271d707 100644
--- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
index d2a21ce8..c7e958a7 100644
--- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
index f8a99f5e..324c24b3 100644
--- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
index 8aa0443e..748add7c 100644
--- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
index 119c3ee6..5ddbc957 100644
--- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
index 1f48803a..0eeb53a4 100644
--- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
index cc4148ac..ede4cf1a 100644
--- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu
index 186ae66b..b982ad11 100644
--- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu
index 1d544467..33abeb82 100644
--- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu
index f2b198e3..d6a64c6e 100644
--- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu
index 7109b0ee..d68400a8 100644
--- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu
index 30a331c3..f99c4652 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
index 6a22161c..f221b14d 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
index d06674f7..57f107d9 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_qgemm_nn_sm50.cu b/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
index 734d7e08..0db0240d 100644
--- a/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_qgemm_nt_sm50.cu b/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
index e9906bc4..36fd261d 100644
--- a/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_qgemm_tn_sm50.cu b/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
index 0537fa67..191b057a 100644
--- a/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_qgemm_tt_sm50.cu b/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
index 10a67b12..1b531f98 100644
--- a/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
index ca161ed4..2fe11c1f 100644
--- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
index 968b759c..1909f23d 100644
--- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
index 401ffd26..eb0e3481 100644
--- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
+++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
index eb8355d3..a1b99337 100644
--- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
index a7810a5d..975b7375 100644
--- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
+++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
index daf722ce..f855c4a2 100644
--- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py
index 87fbea8f..3f57d1b8 100644
--- a/test/unit/gemm/device/simt_sm50.py
+++ b/test/unit/gemm/device/simt_sm50.py
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 # this file creates the test/unit/gemm/device simt tests
@@ -124,27 +130,33 @@
 
         # write file header
         out.write("/***************************************************************************************************\n"
-" * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.\n"
-" *\n"
-" * Redistribution and use in source and binary forms, with or without modification, are permitted\n"
-" * provided that the following conditions are met:\n"
-" *     * Redistributions of source code must retain the above copyright notice, this list of\n"
-" *       conditions and the following disclaimer.\n"
-" *     * Redistributions in binary form must reproduce the above copyright notice, this list of\n"
-" *       conditions and the following disclaimer in the documentation and/or other materials\n"
-" *       provided with the distribution.\n"
-" *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used\n"
-" *       to endorse or promote products derived from this software without specific prior written\n"
-" *       permission.\n"
-" *\n"
-" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR\n"
-" * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND\n"
-" * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE\n"
-" * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n"
-" * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;\n"
-" * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,\n"
-" * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
-" * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
+" * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.                 \n"
+" * SPDX-License-Identifier: BSD-3-Clause                                                           \n"
+" *                                                                                                 \n"
+" * Redistribution and use in source and binary forms, with or without                              \n"
+" * modification, are permitted provided that the following conditions are met:                     \n"
+" *                                                                                                 \n"
+" * 1. Redistributions of source code must retain the above copyright notice, this                  \n"
+" * list of conditions and the following disclaimer.                                                \n"
+" *                                                                                                 \n"
+" * 2. Redistributions in binary form must reproduce the above copyright notice,                    \n"
+" * this list of conditions and the following disclaimer in the documentation                       \n"
+" * and/or other materials provided with the distribution.                                          \n"
+" *                                                                                                 \n"
+" * 3. Neither the name of the copyright holder nor the names of its                                \n"
+" * contributors may be used to endorse or promote products derived from                            \n"
+" * this software without specific prior written permission.                                        \n"
+" *                                                                                                 \n"
+" * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"                   \n"
+" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE                       \n"
+" * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                  \n"
+" * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE                    \n"
+" * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL                      \n"
+" * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR                      \n"
+" * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER                      \n"
+" * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,                   \n"
+" * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE                   \n"
+" * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                            \n"
 " *\n"
 " **************************************************************************************************/\n"
 "/*! \\file\n"
diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
index ceea09d2..30c024a0 100644
--- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
index 7d6f999b..c47d76ae 100644
--- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
index 752d7fa6..62286d95 100644
--- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
index b3a80cdb..c9c0af2b 100644
--- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..5d2389fb
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
new file mode 100644
index 00000000..00584d40
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..b014d0de
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_ls_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
new file mode 100644
index 00000000..f28e12b8
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf32n_cf32n_rs_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
new file mode 100644
index 00000000..743338b1
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_l_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_u_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_u_tensor_op_f64_gaussian, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
new file mode 100644
index 00000000..3b4a6ba0
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_ls_u_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
new file mode 100644
index 00000000..97cbf5c9
--- /dev/null
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
@@ -0,0 +1,172 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_cf64n_cf64n_rs_u_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Symm = cutlass::gemm::device::Symm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..b31c0015
--- /dev/null
+++ b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_fast_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_fast_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_fast_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_fast_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_fast_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_fast_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
new file mode 100644
index 00000000..41332b7d
--- /dev/null
+++ b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
@@ -0,0 +1,276 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_fast_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_fast_f32_align1_align1, 128x64x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_l_tensor_op_fast_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_fast_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_fast_f32_align1_align4, 128x64x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_l_tensor_op_fast_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..d8ec9ff3
--- /dev/null
+++ b/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_fast_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_fast_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_fast_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_fast_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_fast_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_fast_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..191cc916
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_ls_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_ls_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_ls_u_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..d2ab3585
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_rs_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_rs_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_rs_u_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64n_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..69f03848
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_ls_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_ls_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_ls_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..a1fcbf1d
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_rs_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_rs_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_rs_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64n_f64t_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..aacf1460
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_ls_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_ls_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_ls_u_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..0e00311f
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_rs_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_rs_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_rs_u_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64n_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..0dcd8ae5
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_ls_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_ls_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_ls_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_ls_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_ls_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..ddef3a70
--- /dev/null
+++ b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,258 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_rs_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_rs_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_rs_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_rs_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f64t_f64t_rs_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Symm = cutlass::gemm::device::Symm<
+    ElementA,
+    LayoutA,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..d0d29dff
--- /dev/null
+++ b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_l_tensor_op_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_ls_u_tensor_op_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
new file mode 100644
index 00000000..68f0223a
--- /dev/null
+++ b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
@@ -0,0 +1,276 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_f32_align1_align1, 128x64x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_l_tensor_op_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_u_tensor_op_f32_align1_align4, 128x64x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32n_f32n_rs_l_tensor_op_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, 
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..70b3ce61
--- /dev/null
+++ b/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,489 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_symm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Symm_{ElementA/B}{LayoutA/B}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//     _tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_l_tensor_op_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Symm_f32t_f32t_ls_u_tensor_op_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Symm = cutlass::gemm::device::Symm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllSymmUniversal<Symm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..2a8f0fe6
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32n_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32n_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..d642d679
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32n_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32n_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..483f768f
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..b7b610aa
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32t_l_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf32n_cf32t_u_tensor_op_fast_f32, 64x64x16_32x32x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<float>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..fa2a5368
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf64n_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..ff4a61d0
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,150 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf64n_cf64t_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_cf64n_cf64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementB = cutlass::complex<double>;
+  using LayoutB = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    1,     // AlignmentB
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..a341b35e
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f32n_f32n_l_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f32n_f32n_u_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..9c346de2
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,133 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f32t_f32n_l_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f32t_f32n_u_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..447169e0
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64n_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64n_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64n_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..c9575080
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64t_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64t_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64t_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64t_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64n_f64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..0a7d11a9
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,253 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64t_f64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64t_f64n_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64t_f64n_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64t_f64n_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_f64t_f64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = double;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..3f30c97b
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,132 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_tf32n_f32n_l_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_tf32n_f32n_u_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..6696ac4c
--- /dev/null
+++ b/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,133 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank2k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_tf32t_f32n_l_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syr2k_tf32t_f32n_u_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = cutlass::tfloat32_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::tfloat32_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = float;
+
+  using Rank2K = cutlass::gemm::device::Rank2K<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRank2KUniversal<Rank2K>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..8b0e8df4
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32n_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32n_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..179308ec
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32n_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32n_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..c0199570
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32t_l_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32t_u_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..1be43d1a
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32t_l_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf32n_cf32t_u_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<float>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<float>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..bdd54bd6
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf64n_cf64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf64n_cf64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
new file mode 100644
index 00000000..6f76e843
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -0,0 +1,95 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf64n_cf64t_l_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..cca4e01c
--- /dev/null
+++ b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf64n_cf64t_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_cf64n_cf64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = cutlass::complex<double>;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = cutlass::complex<double>;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,     // kStages 
+    1,     // AlignmentA
+    false, // SplitKSerial
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone,
+    cutlass::BlasMode::kSymmetric
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..5ce360b8
--- /dev/null
+++ b/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 64x128x32_32x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 128x64x32_64x32x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 128x128x16_64x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_l_tensor_op_fast_f32, 64x128x16_32x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    6
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_u_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_u_tensor_op_fast_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_u_tensor_op_fast_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_u_tensor_op_fast_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32n_f32t_u_tensor_op_fast_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..27be2e0b
--- /dev/null
+++ b/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 64x128x32_32x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 128x64x32_64x32x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 128x128x16_64x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_l_tensor_op_fast_f32, 64x128x16_32x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    6
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_u_tensor_op_fast_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_u_tensor_op_fast_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_u_tensor_op_fast_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_u_tensor_op_fast_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f32t_f32t_u_tensor_op_fast_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..dd2db3cd
--- /dev/null
+++ b/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,237 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64n_f64t_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64n_f64t_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64n_f64t_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64n_f64t_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64n_f64t_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::ColumnMajor;
+
+  using ElementC = double;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..0686fd58
--- /dev/null
+++ b/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
@@ -0,0 +1,301 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_l_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_l_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_l_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_l_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_u_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_u_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_f64t_f64n_u_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementA = double;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = double;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using ElementAccumulator = double;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..e6efc248
--- /dev/null
+++ b/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 64x128x32_32x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 128x64x32_64x32x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 128x128x16_64x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_l_tensor_op_f32, 64x128x16_32x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    6
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_u_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_u_tensor_op_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_u_tensor_op_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_u_tensor_op_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32n_f32t_u_tensor_op_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..e2c5b237
--- /dev/null
+++ b/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -0,0 +1,541 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide SYRK interface
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_rank_k_universal.h"
+
+#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 64x128x32_32x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 128x64x32_64x32x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 128x128x16_64x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_l_tensor_op_f32, 64x128x16_32x64x16) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kLower,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    6
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_u_tensor_op_f32, 128x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_u_tensor_op_f32, 256x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_u_tensor_op_f32, 64x256x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_u_tensor_op_f32, 256x64x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 64, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Syrk_tf32t_f32t_u_tensor_op_f32, 128x128x32_64x64x32) {
+
+  using ElementA = float;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::RowMajor;
+  using ElementAccumulator = float;
+
+  using RankK = cutlass::gemm::device::RankK<
+    ElementA,
+    LayoutA,
+    ElementC,
+    LayoutC,
+    cutlass::FillMode::kUpper,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllRankKUniversal<RankK>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h
index 88e7b5b1..9b33e0a2 100644
--- a/test/unit/gemm/device/testbed.h
+++ b/test/unit/gemm/device/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -46,6 +52,7 @@
 #include "testbed_utils.h"
 
 #include "cutlass/layout/matrix.h"
+#include "cutlass/matrix_coord.h"
 
 namespace test {
 namespace gemm {
@@ -53,7 +60,7 @@ namespace device {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <typename Gemm>
+template <typename Gemm, bool Relu = false>
 struct Testbed {
 
   using ElementAccumulator = typename Gemm::ElementAccumulator;
@@ -266,6 +273,17 @@ struct Testbed {
       ElementAccumulator(0)
     );
 
+    if (Relu) {
+      for (int i = 0; i < problem_size.m(); ++i) {
+        for (int j = 0; j < problem_size.n(); ++j) {
+           reference_D.at(cutlass::MatrixCoord(i, j)) = 
+                  ((ElementCompute)reference_D.at(cutlass::MatrixCoord(i, j)) < (ElementCompute)0)
+                  ? (typename Gemm::ElementC)0
+                  : reference_D.at(cutlass::MatrixCoord(i, j));
+        }
+      }
+    }
+
     return compare_reference(problem_size, alpha, beta);
   }
 
@@ -368,7 +386,7 @@ struct Testbed {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <typename Gemm>
+template <typename Gemm, bool Relu=false>
 bool TestAllGemm(
     const typename Gemm::LayoutA::Stride& stride_factor_A = typename Gemm::LayoutA::Stride(),
     const typename Gemm::LayoutB::Stride& stride_factor_B = typename Gemm::LayoutB::Stride(),
@@ -418,7 +436,7 @@ bool TestAllGemm(
     2.0
   };
 
-  Testbed<Gemm> testbed(stride_factor_A, stride_factor_B, stride_factor_C);
+  Testbed<Gemm, Relu> testbed(stride_factor_A, stride_factor_B, stride_factor_C);
 
   using ElementCompute = typename Gemm::EpilogueOutputOp::ElementCompute;
 
diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h
index 4834a6af..561859a4 100644
--- a/test/unit/gemm/device/testbed_complex.h
+++ b/test/unit/gemm/device/testbed_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_gemm_with_broadcast.h b/test/unit/gemm/device/testbed_gemm_with_broadcast.h
index d0fa8293..4b1da065 100644
--- a/test/unit/gemm/device/testbed_gemm_with_broadcast.h
+++ b/test/unit/gemm/device/testbed_gemm_with_broadcast.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_gemm_with_reduction.h b/test/unit/gemm/device/testbed_gemm_with_reduction.h
index 1bcc476d..e51eed20 100644
--- a/test/unit/gemm/device/testbed_gemm_with_reduction.h
+++ b/test/unit/gemm/device/testbed_gemm_with_reduction.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_grouped.h b/test/unit/gemm/device/testbed_grouped.h
index a5064c07..29cffda3 100644
--- a/test/unit/gemm/device/testbed_grouped.h
+++ b/test/unit/gemm/device/testbed_grouped.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h
index e0a9330e..ce4e2c9f 100644
--- a/test/unit/gemm/device/testbed_interleaved.h
+++ b/test/unit/gemm/device/testbed_interleaved.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h
index 7ec77088..52aa6982 100644
--- a/test/unit/gemm/device/testbed_planar_complex.h
+++ b/test/unit/gemm/device/testbed_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_rank2k_universal.h b/test/unit/gemm/device/testbed_rank2k_universal.h
new file mode 100644
index 00000000..4442d99f
--- /dev/null
+++ b/test/unit/gemm/device/testbed_rank2k_universal.h
@@ -0,0 +1,639 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Rank 2k update interface
+  
+*/
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/reference/host/rank_2k.h"
+#include "cutlass/util/reference/host/rank_2k_complex.h"
+
+#include "testbed_utils.h"
+
+namespace test {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Rank2K>
+struct TestbedRank2KUniversal {
+
+  using ElementAccumulator = typename Rank2K::ElementAccumulator;
+  using ElementCompute = typename Rank2K::Rank2Kkernel::Epilogue::OutputOp::ElementCompute;
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename Rank2K::ElementA, typename Rank2K::LayoutA> tensor_A;
+  cutlass::HostTensor<typename Rank2K::ElementB, typename Rank2K::LayoutB> tensor_B;
+  cutlass::HostTensor<typename Rank2K::ElementC, typename Rank2K::LayoutC> tensor_C;
+  cutlass::HostTensor<typename Rank2K::ElementC, typename Rank2K::LayoutC> tensor_D;
+  cutlass::HostTensor<typename Rank2K::ElementC, typename Rank2K::LayoutC> reference_D;
+
+  //
+  // Methods
+  //
+
+  TestbedRank2KUniversal(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Rank2K::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, mantissa_in_bits);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(
+        view.data(), view.capacity());
+    } 
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_symmetric_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Rank2K::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillSymmetricRandomUniform(
+        view, seed, Rank2K::kFillModeC, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillSymmetricRandomGaussian(
+        view, seed, Rank2K::kFillModeC, 0, 0.5, mantissa_in_bits);
+    }
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution (symmetric tensor) not implemented";
+      return false;
+    }
+
+    return true;
+  }
+  /// Initializes data structures
+  void initialize(cutlass::gemm::GemmCoord problem_size) {
+    //
+    // Allocate the Rank2K workspace
+    //
+
+    tensor_A.resize(problem_size.mk());
+    tensor_B.resize(problem_size.mk());
+    tensor_C.resize(problem_size.mn());
+    tensor_D.resize(problem_size.mn());
+    reference_D.resize(problem_size.mn(), false);
+
+    EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019, cutlass::MantissaInBits<typename Rank2K::ElementA>::bits));
+    EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018, cutlass::MantissaInBits<typename Rank2K::ElementB>::bits));
+    EXPECT_TRUE(initialize_symmetric_tensor(tensor_C.host_view(), init_C, seed + 2017, cutlass::MantissaInBits<typename Rank2K::ElementC>::bits));
+
+    // It is possible to randomly initialize to all zeros, so override this with non-zeros
+    // in the upper left corner of each operand.
+    tensor_A.host_view().at({0, 0}) = typename Rank2K::ElementA(1);
+    tensor_B.host_view().at({0, 0}) = typename Rank2K::ElementB(1);
+    tensor_C.host_view().at({0, 0}) = typename Rank2K::ElementC(1);
+
+    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());
+
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_C.sync_device();
+    tensor_D.sync_device();
+  }
+
+  /// Compares computed reference with device reference and outputs to a file if incorrect
+  bool compare_reference(
+    cutlass::gemm::GemmCoord problem_size,
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    tensor_D.sync_host();
+
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
+
+    if (tensor_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
+
+    if (reference_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);
+
+    double l2_norm = cutlass::reference::host::TensorRelativeErrorMetric(reference_D.host_view(), tensor_D.host_view());
+
+    bool passed = l2_norm < cutlass::MantissaInBits<typename Rank2K::ElementA>::error;
+
+    return passed;
+  }
+
+  /// Verifies the result is a Rank2K
+  bool verify(
+    cutlass::gemm::GemmCoord problem_size, 
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    //
+    // Verify
+    //
+    cutlass::reference::host::Rank2KComplex<
+        typename Rank2K::ElementA, typename Rank2K::LayoutA,
+        typename Rank2K::ElementB, typename Rank2K::LayoutB,
+        typename Rank2K::ElementC, typename Rank2K::LayoutC, 
+        ElementCompute, ElementAccumulator
+    >(
+      problem_size,
+      alpha, 
+      tensor_A.host_ref(),
+      Rank2K::kTransformA,
+      tensor_B.host_ref(),
+      Rank2K::kTransformB,
+      beta, 
+      tensor_C.host_ref(), 
+      reference_D.host_ref(),
+      ElementAccumulator(0),
+      Rank2K::kFillModeC,
+      Rank2K::kBlasMode
+    );
+
+    return compare_reference(problem_size, alpha, beta);
+  }
+
+  /// Returns true if the CUDA device is sufficient to execute the kernel.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename Rank2K::Rank2Kkernel::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::gemm::GemmUniversalMode mode,
+    cutlass::gemm::GemmCoord problem_size,
+    int batch_count = 1,
+    ElementCompute alpha = ElementCompute(1), 
+    ElementCompute beta = ElementCompute(0)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0
+    std::cout << "[TestbedRank2KUniversal::run()] problem(m, n, k): " << problem_size
+              << " alpha: " << ElementCompute(alpha)
+              << " beta: " << ElementCompute(beta) << std::endl;
+#endif
+
+    this->initialize(problem_size);
+
+    //
+    // Initialize the Rank2K operator
+    //
+
+    typename Rank2K::Arguments arguments{
+      mode,
+      problem_size,
+      batch_count,
+      {alpha, beta},
+      tensor_A.device_data(),
+      tensor_B.device_data(),
+      tensor_C.device_data(),
+      tensor_D.device_data(),
+      problem_size.n() * problem_size.k(),
+      problem_size.n() * problem_size.k(),
+      problem_size.m() * problem_size.n(),
+      problem_size.m() * problem_size.n(),
+      tensor_A.layout().stride(0),
+      tensor_B.layout().stride(0),
+      tensor_C.layout().stride(0),
+      tensor_D.layout().stride(0)
+    };
+
+    Rank2K rank2k_op;
+
+    size_t workspace_size = Rank2K::get_workspace_size(arguments);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = rank2k_op.initialize(arguments, workspace.get());
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Run the Rank2K
+    //
+
+    status = rank2k_op();
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Verify
+    //
+
+    bool passed = this->verify(problem_size, alpha, beta);
+
+    //if (true) {
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_Rank2k_device_"
+            << "fill_mode_c_"
+            << (Rank2K::kFillModeC == cutlass::FillMode::kLower ? "lower_" :
+                (Rank2K::kFillModeC == cutlass::FillMode::kUpper ? "upper_" : "invalid_"))
+            << "mnk_"
+            << problem_size.m() << "x"
+            << problem_size.n() << "x"
+            << problem_size.k() << "_"
+            << Rank2K::ThreadblockShape::kM << "x"  
+            << Rank2K::ThreadblockShape::kN << "x"  
+            << Rank2K::ThreadblockShape::kK << "_"
+            << Rank2K::WarpShape::kM << "x"  
+            << Rank2K::WarpShape::kN << "x"  
+            << Rank2K::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nB:\n" << tensor_B.host_view() << "\n"
+        << "\nC:\n" << tensor_C.host_view() << "\n"
+        << "\nD reference:\n" << reference_D.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D.host_view() << "\n";
+
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Rank2K>
+bool TestRank2kUniversal(
+  cutlass::gemm::GemmCoord const & problem_size,
+  cutlass::gemm::GemmUniversalMode mode,
+  int batch_count,
+  double alpha = 1.0, 
+  double beta = 2.0) {
+
+  bool passed = true;
+
+  TestbedRank2KUniversal<Rank2K> testbed;
+  
+  using ElementCompute = typename Rank2K::EpilogueOutputOp::ElementCompute;
+
+  passed = testbed.run(
+    mode,
+    problem_size,
+    batch_count,
+    cutlass::from_real<ElementCompute>(alpha), 
+    cutlass::from_real<ElementCompute>(beta)
+  );
+
+  return passed;
+}
+
+template <typename Rank2K>
+bool TestAllRank2KUniversal() {
+  bool passed = true;
+
+
+  int const kMinimumOperandElementSize = int(cutlass::sizeof_bits<typename Rank2K::ElementA>::value);
+
+  int const kAlignment = cutlass::platform::is_same<
+                              typename Rank2K::OperatorClass, 
+                              cutlass::arch::OpClassSimt>::value ? 1 : 128 / kMinimumOperandElementSize;
+
+  // int8_t gemm alignment constraints
+  int const kAlignmentM = cutlass::platform::is_same<typename Rank2K::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Rank2K::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Rank2K::LayoutA, cutlass::layout::ColumnMajor>::value ? 4 : kAlignment;
+
+  int const kAlignmentN = kAlignmentM;
+
+  int const kAlignmentK = cutlass::platform::is_same<typename Rank2K::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Rank2K::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Rank2K::LayoutA, cutlass::layout::RowMajor>::value
+                           ? 4 : kAlignment;
+
+  cutlass::gemm::GemmUniversalMode modes[] = {
+    cutlass::gemm::GemmUniversalMode::kGemm,
+  };
+
+  int problem_size_n[] = {
+    kAlignmentN, 512 - 2*kAlignmentN
+  };
+
+  int problem_size_k[] = {
+    kAlignmentK, 
+    Rank2K::ThreadblockShape::kK * Rank2K::kStages - kAlignmentK, 
+    Rank2K::ThreadblockShape::kK * Rank2K::kStages * 3 - kAlignmentK
+  };
+
+  int batch_counts[] = {      // may be interpretted as batch count or split-K slices
+    1                         // Just running one batch for now (removing 2, 3, 5, 7)
+  };
+
+  double problem_alpha[] = {
+    1.0, 3.25
+  };
+
+  double problem_beta[] = {
+    0.0, 2.15
+  };
+
+  using ElementCompute = typename Rank2K::EpilogueOutputOp::ElementCompute;
+
+  for (cutlass::gemm::GemmUniversalMode mode : modes) {
+    for (int n : problem_size_n) {
+      for (int k : problem_size_k) {
+        for (int batch_count : batch_counts) {
+
+          for (auto alpha : problem_alpha) {
+            for (auto beta : problem_beta) {
+
+              if (mode == cutlass::gemm::GemmUniversalMode::kGemm ||
+                mode == cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) {
+
+                // skip very small K problems
+                //if (k / batch_count < 2 * Rank2K::ThreadblockShape::kK) {
+                //  continue;
+                //}
+              }
+
+              cutlass::gemm::GemmCoord problem_size(n, n, k);
+
+              TestbedRank2KUniversal<Rank2K> testbed;
+
+              passed = testbed.run(
+                mode,
+                problem_size,
+                batch_count,
+                cutlass::from_real<ElementCompute>(alpha), 
+                cutlass::from_real<ElementCompute>(beta)
+              );
+
+              if (!passed) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+template <typename Rank2K>
+bool TestAllRank2KHermitianUniversal() {
+  bool passed = true;
+
+  using ElementCompute = typename Rank2K::EpilogueOutputOp::ElementCompute;
+  using ElementAccumulator = typename Rank2K::ElementAccumulator;
+
+  int const kMinimumOperandElementSize = int(cutlass::sizeof_bits<typename Rank2K::ElementA>::value);
+
+  int const kAlignment = cutlass::platform::is_same<
+                              typename Rank2K::OperatorClass, 
+                              cutlass::arch::OpClassSimt>::value ? 1 : 128 / kMinimumOperandElementSize;
+
+  // int8_t gemm alignment constraints
+  int const kAlignmentM = cutlass::platform::is_same<typename Rank2K::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Rank2K::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Rank2K::LayoutA, cutlass::layout::ColumnMajor>::value ? 4 : kAlignment;
+
+  int const kAlignmentN = kAlignmentM;
+
+  int const kAlignmentK = cutlass::platform::is_same<typename Rank2K::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Rank2K::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Rank2K::LayoutA, cutlass::layout::RowMajor>::value
+                           ? 4 : kAlignment;
+
+  cutlass::gemm::GemmUniversalMode modes[] = {
+    cutlass::gemm::GemmUniversalMode::kGemm,
+  };
+
+  int problem_size_n[] = {
+    kAlignmentN, 512 - 2*kAlignmentN
+  };
+
+  int problem_size_k[] = {
+    kAlignmentK, 
+    Rank2K::ThreadblockShape::kK * Rank2K::kStages - kAlignmentK, 
+    Rank2K::ThreadblockShape::kK * Rank2K::kStages * 3 - kAlignmentK
+  };
+
+  int batch_counts[] = {      // may be interpretted as batch count or split-K slices
+    1                         // Just running one batch for now (removing 2, 3, 5, 7)
+  };
+
+  /* Complex alpha for HER2K */
+  ElementAccumulator problem_alpha[] = {
+    {1.0},
+    {1.25, 3.25},
+    {-0.25, -2.25}
+  };
+
+  ElementAccumulator problem_beta[] = {
+    0.0, -2.25
+  };
+
+  for (cutlass::gemm::GemmUniversalMode mode : modes) {
+    for (int n : problem_size_n) {
+      for (int k : problem_size_k) {
+        for (int batch_count : batch_counts) {
+
+          for (auto alpha : problem_alpha) {
+            for (auto beta : problem_beta) {
+
+              if (mode == cutlass::gemm::GemmUniversalMode::kGemm ||
+                mode == cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) {
+
+                // skip very small K problems
+                //if (k / batch_count < 2 * Rank2K::ThreadblockShape::kK) {
+                //  continue;
+                //}
+              }
+
+              cutlass::gemm::GemmCoord problem_size(n, n, k);
+
+              TestbedRank2KUniversal<Rank2K> testbed;
+
+              passed = testbed.run(
+                mode,
+                problem_size,
+                batch_count,
+                alpha,
+                beta
+              );
+
+              if (!passed) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace test
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/test/unit/gemm/device/testbed_rank_k_universal.h b/test/unit/gemm/device/testbed_rank_k_universal.h
new file mode 100644
index 00000000..d4a946c5
--- /dev/null
+++ b/test/unit/gemm/device/testbed_rank_k_universal.h
@@ -0,0 +1,509 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Rank 2k update interface
+  
+*/
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/reference/host/rank_k_complex.h"
+
+#include "testbed_utils.h"
+
+namespace test {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename RankK>
+struct TestbedRank2KUniversal {
+
+  using ElementAccumulator = typename RankK::ElementAccumulator;
+  using ElementCompute = typename RankK::RankKkernel::Epilogue::OutputOp::ElementCompute;
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename RankK::ElementA, typename RankK::LayoutA> tensor_A;
+  cutlass::HostTensor<typename RankK::ElementC, typename RankK::LayoutC> tensor_C;
+  cutlass::HostTensor<typename RankK::ElementC, typename RankK::LayoutC> tensor_D;
+  cutlass::HostTensor<typename RankK::ElementC, typename RankK::LayoutC> reference_D;
+
+  //
+  // Methods
+  //
+
+  TestbedRank2KUniversal(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_C(init_C_), seed(seed_) { }
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename RankK::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, mantissa_in_bits);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(
+        view.data(), view.capacity());
+    } 
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_symmetric_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename RankK::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillSymmetricRandomUniform(
+        view, seed, RankK::kFillModeC, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillSymmetricRandomGaussian(
+        view, seed, RankK::kFillModeC, 0, 0.5, mantissa_in_bits);
+    }
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution (symmetric tensor) not implemented";
+      return false;
+    }
+
+    return true;
+  }
+  /// Initializes data structures
+  void initialize(cutlass::gemm::GemmCoord problem_size) {
+    //
+    // Allocate the RankK workspace
+    //
+
+    tensor_A.resize(problem_size.mk());
+    tensor_C.resize(problem_size.mn());
+    tensor_D.resize(problem_size.mn());
+    reference_D.resize(problem_size.mn(), false);
+
+    EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019, cutlass::MantissaInBits<typename RankK::ElementA>::bits));
+    EXPECT_TRUE(initialize_symmetric_tensor(tensor_C.host_view(), init_C, seed + 2017, cutlass::MantissaInBits<typename RankK::ElementC>::bits));
+
+    // It is possible to randomly initialize to all zeros, so override this with non-zeros
+    // in the upper left corner of each operand.
+    tensor_A.host_view().at({0, 0}) = typename RankK::ElementA(1);
+    tensor_C.host_view().at({0, 0}) = typename RankK::ElementC(1);
+
+    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());
+
+    tensor_A.sync_device();
+    tensor_C.sync_device();
+    tensor_D.sync_device();
+  }
+
+  /// Compares computed reference with device reference and outputs to a file if incorrect
+  bool compare_reference(
+    cutlass::gemm::GemmCoord problem_size,
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    tensor_D.sync_host();
+
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
+
+    if (tensor_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
+
+    if (reference_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);
+
+    double l2_norm = cutlass::reference::host::TensorRelativeErrorMetric(reference_D.host_view(), tensor_D.host_view());
+
+    bool passed = l2_norm < cutlass::MantissaInBits<typename RankK::ElementA>::error;
+
+    return passed;
+  }
+
+  /// Verifies the result is a RankK
+  bool verify(
+    cutlass::gemm::GemmCoord problem_size, 
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    //
+    // Verify
+    //
+    cutlass::reference::host::Rank2KComplex<
+        typename RankK::ElementA, typename RankK::LayoutA,
+        typename RankK::ElementC, typename RankK::LayoutC, 
+        ElementCompute, ElementAccumulator
+    >(
+      problem_size,
+      alpha, 
+      tensor_A.host_ref(),
+      RankK::kTransformA,
+      beta, 
+      tensor_C.host_ref(), 
+      reference_D.host_ref(),
+      ElementAccumulator(0),
+      RankK::kFillModeC,
+      RankK::kBlasMode
+    );
+
+    return compare_reference(problem_size, alpha, beta);
+  }
+
+  /// Returns true if the CUDA device is sufficient to execute the kernel.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename RankK::RankKkernel::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::gemm::GemmUniversalMode mode,
+    cutlass::gemm::GemmCoord problem_size,
+    int batch_count = 1,
+    ElementCompute alpha = ElementCompute(1), 
+    ElementCompute beta = ElementCompute(0)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0
+    std::cout << "[TestbedRankKUniversal::run()] problem(m, n, k): " << problem_size
+              << " alpha: " << ElementCompute(alpha)
+              << " beta: " << ElementCompute(beta) << std::endl;
+#endif
+
+    this->initialize(problem_size);
+
+    //
+    // Initialize the RankK operator
+    //
+
+    typename RankK::Arguments arguments{
+      mode,
+      problem_size,
+      batch_count,
+      {alpha, beta},
+      tensor_A.device_data(),
+      tensor_C.device_data(),
+      tensor_D.device_data(),
+      problem_size.n() * problem_size.k(),
+      problem_size.m() * problem_size.n(),
+      problem_size.m() * problem_size.n(),
+      tensor_A.layout().stride(0),
+      tensor_C.layout().stride(0),
+      tensor_D.layout().stride(0)
+    };
+
+    RankK rank2k_op;
+
+    size_t workspace_size = RankK::get_workspace_size(arguments);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = rank2k_op.initialize(arguments, workspace.get());
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Run the RankK
+    //
+
+    status = rank2k_op();
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Verify
+    //
+
+    bool passed = this->verify(problem_size, alpha, beta);
+
+    //if (true) {
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_RankK_device_"
+            << "fill_mode_c_"
+            << (RankK::kFillModeC == cutlass::FillMode::kLower ? "lower_" :
+                (RankK::kFillModeC == cutlass::FillMode::kUpper ? "upper_" : "invalid_"))
+            << "mnk_"
+            << problem_size.m() << "x"
+            << problem_size.n() << "x"
+            << problem_size.k() << "_"
+            << RankK::ThreadblockShape::kM << "x"  
+            << RankK::ThreadblockShape::kN << "x"  
+            << RankK::ThreadblockShape::kK << "_"
+            << RankK::WarpShape::kM << "x"  
+            << RankK::WarpShape::kN << "x"  
+            << RankK::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nC:\n" << tensor_C.host_view() << "\n"
+        << "\nD reference:\n" << reference_D.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D.host_view() << "\n";
+
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename RankK>
+bool TestRank2kUniversal(
+  cutlass::gemm::GemmCoord const & problem_size,
+  cutlass::gemm::GemmUniversalMode mode,
+  int batch_count,
+  double alpha = 1.0, 
+  double beta = 2.0) {
+
+  bool passed = true;
+
+  TestbedRank2KUniversal<RankK> testbed;
+  
+  using ElementCompute = typename RankK::EpilogueOutputOp::ElementCompute;
+
+  passed = testbed.run(
+    mode,
+    problem_size,
+    batch_count,
+    cutlass::from_real<ElementCompute>(alpha), 
+    cutlass::from_real<ElementCompute>(beta)
+  );
+
+  return passed;
+}
+
+template <typename RankK>
+bool TestAllRankKUniversal() {
+  bool passed = true;
+
+
+  int const kMinimumOperandElementSize = int(cutlass::sizeof_bits<typename RankK::ElementA>::value);
+  int const kAlignmentN = 128 / kMinimumOperandElementSize;
+  int const kAlignmentK = 128 / kMinimumOperandElementSize;
+
+  cutlass::gemm::GemmUniversalMode modes[] = {
+    cutlass::gemm::GemmUniversalMode::kGemm,
+  };
+
+  int problem_size_n[] = {
+    kAlignmentN, 512 - 2*kAlignmentN
+  };
+
+  int problem_size_k[] = {
+    kAlignmentK, 
+    RankK::ThreadblockShape::kK * RankK::kStages - kAlignmentK, 
+    RankK::ThreadblockShape::kK * RankK::kStages * 3 - kAlignmentK
+  };
+
+  int batch_counts[] = {      // may be interpretted as batch count or split-K slices
+    1                         // Just running one batch for now (removing 2, 3, 5, 7)
+  };
+
+  double problem_alpha[] = {
+    1.0
+  };
+
+  double problem_beta[] = {
+    2.0
+  };
+
+
+  using ElementCompute = typename RankK::EpilogueOutputOp::ElementCompute;
+
+  for (cutlass::gemm::GemmUniversalMode mode : modes) {
+    for (int n : problem_size_n) {
+      for (int k : problem_size_k) {
+        for (int batch_count : batch_counts) {
+
+          for (auto alpha : problem_alpha) {
+            for (auto beta : problem_beta) {
+
+              if (mode == cutlass::gemm::GemmUniversalMode::kGemm ||
+                mode == cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) {
+              }
+
+              cutlass::gemm::GemmCoord problem_size(n, n, k);
+
+              TestbedRank2KUniversal<RankK> testbed;
+
+              passed = testbed.run(
+                mode,
+                problem_size,
+                batch_count,
+                cutlass::from_real<ElementCompute>(alpha), 
+                cutlass::from_real<ElementCompute>(beta)
+              );
+
+              if (!passed) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace test
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h
index e68e89d0..d7f63c3c 100644
--- a/test/unit/gemm/device/testbed_sanity.h
+++ b/test/unit/gemm/device/testbed_sanity.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h
index 24581a62..1509d9dd 100644
--- a/test/unit/gemm/device/testbed_sparse.h
+++ b/test/unit/gemm/device/testbed_sparse.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h
index e120a629..8ad85a9e 100644
--- a/test/unit/gemm/device/testbed_splitk.h
+++ b/test/unit/gemm/device/testbed_splitk.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_symm_universal.h b/test/unit/gemm/device/testbed_symm_universal.h
new file mode 100644
index 00000000..14218d64
--- /dev/null
+++ b/test/unit/gemm/device/testbed_symm_universal.h
@@ -0,0 +1,589 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide Symm update interface
+  
+*/
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/reference/host/symm.h"
+#include "cutlass/util/reference/host/symm_complex.h"
+
+#include "testbed_utils.h"
+
+namespace test {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Symm>
+struct TestbedSymmUniversal {
+
+  using ElementAccumulator = typename Symm::ElementAccumulator;
+  using ElementCompute = typename Symm::SymmKernel::Epilogue::OutputOp::ElementCompute;
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_C;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename Symm::ElementA, typename Symm::LayoutA> tensor_A;
+  cutlass::HostTensor<typename Symm::ElementB, typename Symm::LayoutB> tensor_B;
+  cutlass::HostTensor<typename Symm::ElementC, typename Symm::LayoutC> tensor_C;
+  cutlass::HostTensor<typename Symm::ElementC, typename Symm::LayoutC> tensor_D;
+  cutlass::HostTensor<typename Symm::ElementC, typename Symm::LayoutC> reference_D;
+
+  //
+  // Methods
+  //
+
+  TestbedSymmUniversal(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { }
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Symm::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, mantissa_in_bits);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(
+        view.data(), view.capacity());
+    } 
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_symmetric_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Symm::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillSymmetricRandomUniform(
+        view, seed, Symm::kFillModeA, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillSymmetricRandomGaussian(
+        view, seed, Symm::kFillModeA, 0, 0.5, mantissa_in_bits);
+    }
+    else {
+
+      EXPECT_TRUE(false) << "Input distribution (symmetric tensor) not implemented";
+      return false;
+    }
+
+    return true;
+  }
+  /// Initializes data structures
+  void initialize(cutlass::gemm::GemmCoord problem_size) {
+    //
+    // Allocate the Symm workspace
+    //
+
+    if (Symm::kSideModeA == cutlass::SideMode::kLeft) {
+      tensor_A.resize(cutlass::make_Coord(problem_size.m(),problem_size.m()));
+    }
+    else if (Symm::kSideModeA == cutlass::SideMode::kRight) {
+      tensor_A.resize(cutlass::make_Coord(problem_size.n(),problem_size.n()));
+    }
+
+    tensor_B.resize(problem_size.mn());
+    tensor_C.resize(problem_size.mn());
+    tensor_D.resize(problem_size.mn());
+    reference_D.resize(problem_size.mn(), false);
+
+    EXPECT_TRUE(initialize_symmetric_tensor(tensor_A.host_view(), init_A, seed + 2019, cutlass::MantissaInBits<typename Symm::ElementA>::bits));
+    EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018, cutlass::MantissaInBits<typename Symm::ElementB>::bits));
+    EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017, cutlass::MantissaInBits<typename Symm::ElementC>::bits));
+
+    // It is possible to randomly initialize to all zeros, so override this with non-zeros
+    // in the upper left corner of each operand.
+    tensor_A.host_view().at({0, 0}) = typename Symm::ElementA(1);
+    tensor_B.host_view().at({0, 0}) = typename Symm::ElementB(1);
+    tensor_C.host_view().at({0, 0}) = typename Symm::ElementC(1);
+
+    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());
+
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_C.sync_device();
+    tensor_D.sync_device();
+  }
+
+  /// Compares computed reference with device reference and outputs to a file if incorrect
+  bool compare_reference(
+    cutlass::gemm::GemmCoord problem_size,
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    tensor_D.sync_host();
+
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
+
+    if (tensor_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
+
+    if (reference_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);
+
+    double l2_norm = cutlass::reference::host::TensorRelativeErrorMetric(reference_D.host_view(), tensor_D.host_view());
+
+    bool passed = l2_norm < cutlass::MantissaInBits<typename Symm::ElementA>::error;
+
+    return passed;
+  }
+
+  /// Verifies the result is a Symm
+  bool verify(
+    cutlass::gemm::GemmCoord problem_size, 
+    ElementCompute alpha, 
+    ElementCompute beta) {
+
+    //
+    // Verify
+    //
+
+    using HostReference = typename cutlass::platform::conditional<
+                              (cutlass::platform::is_same<typename Symm::ElementC,
+                                                          cutlass::complex<double>
+                                                         >::value ||
+                              cutlass::platform::is_same<typename Symm::ElementC,
+                                                          cutlass::complex<float>
+                                                         >::value
+                              ), 
+                              cutlass::reference::host::SymmComplex<
+                                  typename Symm::ElementA, typename Symm::LayoutA,
+                                  Symm::kSideModeA, Symm::kFillModeA,
+                                  typename Symm::ElementB, typename Symm::LayoutB,
+                                  typename Symm::ElementC, typename Symm::LayoutC, 
+                                  ElementCompute,
+                                  ElementAccumulator,
+                                  Symm::kBlasMode>,
+                              cutlass::reference::host::Symm<
+                                  typename Symm::ElementA, typename Symm::LayoutA,
+                                  Symm::kSideModeA, Symm::kFillModeA, 
+                                  typename Symm::ElementB, typename Symm::LayoutB,
+                                  typename Symm::ElementC, typename Symm::LayoutC, 
+                                  ElementCompute,
+                                  ElementAccumulator>
+                           >::type;
+
+
+    HostReference reference_symm;
+
+    reference_symm(
+      problem_size,
+      alpha, 
+      tensor_A.host_ref(),
+      tensor_B.host_ref(),
+      beta, 
+      tensor_C.host_ref(), 
+      reference_D.host_ref(),
+      ElementAccumulator(0)
+    );
+
+    return compare_reference(problem_size, alpha, beta);
+  }
+
+  /// Returns true if the CUDA device is sufficient to execute the kernel.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename Symm::SymmKernel::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::gemm::GemmUniversalMode mode,
+    cutlass::gemm::GemmCoord problem_size,
+    int batch_count = 1,
+    ElementCompute alpha = ElementCompute(1), 
+    ElementCompute beta = ElementCompute(0)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0
+    std::cout << "[TestbedSymmUniversal::run()] problem(m, n, k): " << problem_size
+              << " alpha: " << ElementCompute(alpha)
+              << " beta: " << ElementCompute(beta) << std::endl;
+#endif
+
+    this->initialize(problem_size);
+
+    //
+    // Initialize the Symm operator
+    //
+
+    int batch_stride_A;
+    if (Symm::kSideModeA == cutlass::SideMode::kLeft)
+      batch_stride_A = problem_size.m()*problem_size.m();
+    if (Symm::kSideModeA == cutlass::SideMode::kRight)
+      batch_stride_A = problem_size.n()*problem_size.n();
+
+    typename Symm::Arguments arguments{
+      mode,
+      problem_size,
+      batch_count,
+      {alpha, beta},
+      tensor_A.device_data(),
+      tensor_B.device_data(),
+      tensor_C.device_data(),
+      tensor_D.device_data(),
+      batch_stride_A,
+      problem_size.m() * problem_size.n(),
+      problem_size.m() * problem_size.n(),
+      problem_size.m() * problem_size.n(),
+      tensor_A.layout().stride(0),
+      tensor_B.layout().stride(0),
+      tensor_C.layout().stride(0),
+      tensor_D.layout().stride(0)
+    };
+
+    Symm symm_op;
+
+    size_t workspace_size = Symm::get_workspace_size(arguments);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = symm_op.initialize(arguments, workspace.get());
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Run the Symm
+    //
+
+    status = symm_op();
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Verify
+    //
+
+    bool passed = this->verify(problem_size, alpha, beta);
+
+    //if (true) {
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_"
+            << (Symm::kBlasMode == cutlass::BlasMode::kSymmetric ? "symm_" : "hemm_" )
+            << "device_"
+            << "fill_mode_a_"
+            << (Symm::kSideModeA == cutlass::SideMode::kLeft ? "leftside_" :
+                (Symm::kSideModeA == cutlass::SideMode::kRight ? "rightside_" : "invalid_"))            
+            << (Symm::kFillModeA == cutlass::FillMode::kLower ? "lower_" :
+                (Symm::kFillModeA == cutlass::FillMode::kUpper ? "upper_" : "invalid_"))
+            << "mnk_"
+            << problem_size.m() << "x"
+            << problem_size.n() << "x"
+            << problem_size.k() << "_"
+            << Symm::ThreadblockShape::kM << "x"  
+            << Symm::ThreadblockShape::kN << "x"  
+            << Symm::ThreadblockShape::kK << "_"
+            << Symm::WarpShape::kM << "x"  
+            << Symm::WarpShape::kN << "x"  
+            << Symm::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "alpha: " << ElementCompute(alpha) << "\n"
+        << "beta: "  << ElementCompute(beta) << "\n"
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nB:\n" << tensor_B.host_view() << "\n"
+        << "\nC:\n" << tensor_C.host_view() << "\n"
+        << "\nD reference:\n" << reference_D.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D.host_view() << "\n";
+
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Symm>
+bool TestsymmUniversal(
+  cutlass::gemm::GemmCoord const & problem_size,
+  cutlass::gemm::GemmUniversalMode mode,
+  int batch_count,
+  double alpha = 1.0, 
+  double beta = 2.0) {
+
+  bool passed = true;
+
+  TestbedSymmUniversal<Symm> testbed;
+  
+  using ElementCompute = typename Symm::EpilogueOutputOp::ElementCompute;
+
+  passed = testbed.run(
+    mode,
+    problem_size,
+    batch_count,
+    cutlass::from_real<ElementCompute>(alpha), 
+    cutlass::from_real<ElementCompute>(beta)
+  );
+
+  return passed;
+}
+
+template <typename Symm>
+bool TestAllSymmUniversal() {
+  bool passed = true;
+
+
+  int const kMinimumOperandElementSize = int(cutlass::sizeof_bits<typename Symm::ElementA>::value);
+
+  int const kAlignment = cutlass::platform::is_same<
+                              typename Symm::OperatorClass, 
+                              cutlass::arch::OpClassSimt>::value ? 1 : 128 / kMinimumOperandElementSize;
+
+  // int8_t gemm alignment constraints
+  int const kAlignmentM = cutlass::platform::is_same<typename Symm::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Symm::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Symm::LayoutA, cutlass::layout::ColumnMajor>::value ? 4 : kAlignment;
+
+  int const kAlignmentN = kAlignmentM;
+
+  int const kAlignmentK = cutlass::platform::is_same<typename Symm::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Symm::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Symm::LayoutA, cutlass::layout::RowMajor>::value
+                           ? 4 : kAlignment;
+
+  cutlass::gemm::GemmUniversalMode modes[] = {
+    cutlass::gemm::GemmUniversalMode::kGemm,
+  };
+
+  int problem_size_m[] = {
+    kAlignmentK, 
+    Symm::ThreadblockShape::kK * Symm::kStages - kAlignmentK, 
+    Symm::ThreadblockShape::kK * Symm::kStages * 3 - kAlignmentK
+  };
+
+  int problem_size_n[] = {
+    kAlignmentN, 512 - 2*kAlignmentN
+  };
+
+  int batch_counts[] = {      // may be interpretted as batch count or split-K slices
+    1                         // Just running one batch for now (removing 2, 3, 5, 7)
+  };
+
+  double problem_alpha[] = {
+    1.0, 3.0
+  };
+
+  double problem_beta[] = {
+    0, 2.0
+  };
+
+
+  using ElementCompute = typename Symm::EpilogueOutputOp::ElementCompute;
+
+  for (cutlass::gemm::GemmUniversalMode mode : modes) {
+    for (int m : problem_size_m) {
+      for (int n : problem_size_n) {
+        for (int batch_count : batch_counts) {
+
+          for (auto alpha : problem_alpha) {
+            for (auto beta : problem_beta) {
+              
+              int k = 0;
+              if (Symm::kSideModeA == cutlass::SideMode::kLeft)
+                k = m;
+              else if (Symm::kSideModeA == cutlass::SideMode::kRight)
+                k = n;
+
+              if (mode == cutlass::gemm::GemmUniversalMode::kGemm ||
+                mode == cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) {
+
+  #if 0
+                // skip very small K problems
+                if (k / batch_count < 2 * Symm::ThreadblockShape::kK) {
+                  continue;
+                }
+  #endif
+              }
+
+              cutlass::gemm::GemmCoord problem_size(m, n, k);
+
+              TestbedSymmUniversal<Symm> testbed;
+
+              passed = testbed.run(
+                mode,
+                problem_size,
+                batch_count,
+                cutlass::from_real<ElementCompute>(alpha), 
+                cutlass::from_real<ElementCompute>(beta)
+              );
+
+              if (!passed) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace test
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/gemm/device/testbed_trmm_universal.h b/test/unit/gemm/device/testbed_trmm_universal.h
new file mode 100644
index 00000000..13c3c44f
--- /dev/null
+++ b/test/unit/gemm/device/testbed_trmm_universal.h
@@ -0,0 +1,606 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/trmm_complex.h"
+#include "cutlass/core_io.h"
+
+#include "testbed_utils.h"
+
+namespace test {
+namespace gemm {
+namespace device {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Trmm>
+struct TestbedTrmmUniversal {
+
+  using ElementAccumulator = typename Trmm::ElementAccumulator;
+  using ElementCompute = typename Trmm::TrmmKernel::Epilogue::OutputOp::ElementCompute;
+
+  /// Initialization
+  cutlass::Distribution::Kind init_A;
+  cutlass::Distribution::Kind init_B;
+  cutlass::Distribution::Kind init_D;
+  uint64_t seed;
+
+  cutlass::HostTensor<typename Trmm::ElementA, typename Trmm::LayoutA> tensor_A;
+  cutlass::HostTensor<typename Trmm::ElementB, typename Trmm::LayoutB> tensor_B;
+  cutlass::HostTensor<typename Trmm::ElementC, typename Trmm::LayoutC> tensor_D;
+  cutlass::HostTensor<typename Trmm::ElementC, typename Trmm::LayoutC> reference_D;
+
+  //
+  // Methods
+  //
+
+  TestbedTrmmUniversal(
+    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
+    cutlass::Distribution::Kind init_D_ = cutlass::Distribution::Uniform,
+    uint64_t seed_ = 2080
+  ):
+    init_A(init_A_), init_B(init_B_), init_D(init_D_), seed(seed_) { }
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Trmm::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillRandomUniform(
+        view, seed, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Identity) {
+
+      cutlass::reference::host::TensorFillIdentity(view);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, mantissa_in_bits);
+    }
+    else if (dist_kind == cutlass::Distribution::Sequential) {
+
+      cutlass::reference::host::BlockFillSequential(
+        view.data(), view.capacity());
+    } 
+    else {
+      // TODO: Implement the rest
+      EXPECT_TRUE(false) << "Not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+
+  /// Helper to initialize a tensor view
+  template <typename Element, typename Layout>
+  bool initialize_symmetric_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int mantissa_in_bits) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Trmm::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillSymmetricRandomUniform(
+        view, seed, Trmm::kFillMode, scope_max, scope_min, mantissa_in_bits);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      cutlass::reference::host::TensorFillSymmetricRandomGaussian(
+        view, seed, Trmm::kFillMode, 0, 0.5, mantissa_in_bits);
+    }
+    else {
+      // TODO: Implement the rest
+      EXPECT_TRUE(false) << "Not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Helper to initialize a tensor view (pad diagonal fill with zeros for up to alignment on wrong side of diagonal)
+  template <typename Element, typename Layout>
+  bool initialize_pad_diagonal_tensor(
+    cutlass::TensorView<Element, Layout> view, 
+    cutlass::Distribution::Kind dist_kind,
+    uint64_t seed,
+    int alignment) {
+
+    if (dist_kind == cutlass::Distribution::Uniform) {
+
+      double scope_max, scope_min;
+      int bits_input = cutlass::sizeof_bits<Element>::value;
+      int bits_output = cutlass::sizeof_bits<typename Trmm::ElementC>::value;
+
+      if (bits_input == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      } else if (bits_input <= 8) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (bits_output == 16) {
+        scope_max = 5;
+        scope_min = -5;
+      } else {
+        scope_max = 8;
+        scope_min = -8;
+      }
+
+      cutlass::reference::host::TensorFillPadDiagonalRandomUniform(
+        view, seed, Trmm::kFillMode, scope_max, scope_min, 0, alignment);
+    } 
+    else if (dist_kind == cutlass::Distribution::Gaussian) {
+
+      EXPECT_TRUE(false) << "Gaussian distribution for pad diagonal not implemented";
+    }
+    else {
+      // TODO: Implement the rest
+      EXPECT_TRUE(false) << "Not implemented";
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Initializes data structures
+  void initialize(cutlass::gemm::GemmCoord problem_size) {
+    //
+    // Allocate the TRMM workspace
+    //
+
+    if (Trmm::kSideMode == cutlass::SideMode::kLeft) {
+      tensor_A.resize(cutlass::make_Coord(problem_size.m(),problem_size.m()));
+    }
+    else if (Trmm::kSideMode == cutlass::SideMode::kRight) {
+      tensor_A.resize(cutlass::make_Coord(problem_size.n(),problem_size.n()));
+    }
+
+    tensor_B.resize(problem_size.mn());
+    tensor_D.resize(problem_size.mn());
+    reference_D.resize(problem_size.mn(), false);
+
+    //EXPECT_TRUE(initialize_symmetric_tensor(tensor_A.host_view(), init_A, seed + 2017));
+    //EXPECT_TRUE(initialize_pad_diagonal_tensor(tensor_A.host_view(), init_A, seed + 2017, Trmm::kAlignmentA));
+    EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2017, cutlass::MantissaInBits<typename Trmm::ElementA>::bits));
+    EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2019, cutlass::MantissaInBits<typename Trmm::ElementB>::bits));
+
+    // It is possible to randomly initialize to all zeros, so override this with non-zeros
+    // in the upper left corner of each operand.
+    tensor_A.host_view().at({0, 0}) = typename Trmm::ElementA(1);
+    tensor_B.host_view().at({0, 0}) = typename Trmm::ElementB(1);
+
+    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_D.host_view());
+
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_D.sync_device();
+  }
+
+  /// Compares computed reference with device reference and outputs to a file if incorrect
+  bool compare_reference(
+    cutlass::gemm::GemmCoord problem_size,
+    ElementCompute alpha) {
+
+    tensor_D.sync_host();
+
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
+    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
+
+    if (tensor_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
+
+    if (reference_D.size() > 1)
+      EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);
+
+    double l2_norm = cutlass::reference::host::TensorRelativeErrorMetric(reference_D.host_view(), tensor_D.host_view());
+
+    bool passed = l2_norm < cutlass::MantissaInBits<typename Trmm::ElementA>::error;
+
+    return passed;
+  }
+
+  /// Verifies the result is a TRMM
+  bool verify(
+    cutlass::gemm::GemmCoord problem_size, 
+    ElementCompute alpha) {
+
+    //
+    // Verify
+    //
+
+    using HostReference = typename cutlass::platform::conditional<
+                              (cutlass::platform::is_same<typename Trmm::ElementC,
+                                                          cutlass::complex<double>
+                                                         >::value ||
+                              cutlass::platform::is_same<typename Trmm::ElementC,
+                                                          cutlass::complex<float>
+                                                         >::value
+                              ), 
+                              cutlass::reference::host::TrmmComplex<
+                                  typename Trmm::ElementA, typename Trmm::LayoutA,
+                                  Trmm::kTransformA,
+                                  Trmm::kSideMode, Trmm::kFillMode, Trmm::kDiagType,
+                                  typename Trmm::ElementB, typename Trmm::LayoutB,
+                                  Trmm::kTransformB,
+                                  typename Trmm::ElementC, typename Trmm::LayoutC, 
+                                  ElementCompute,
+                                  ElementAccumulator>,
+                              cutlass::reference::host::Trmm<
+                                  typename Trmm::ElementA, typename Trmm::LayoutA,
+                                  Trmm::kSideMode, Trmm::kFillMode, Trmm::kDiagType,
+                                  typename Trmm::ElementB, typename Trmm::LayoutB,
+                                  typename Trmm::ElementC, typename Trmm::LayoutC, 
+                                  ElementCompute,
+                                  ElementAccumulator>
+                           >::type;
+
+
+    HostReference reference_trmm;
+
+    reference_trmm(
+      problem_size,
+      alpha, 
+      tensor_A.host_ref(),
+      tensor_B.host_ref(),
+      reference_D.host_ref(), 
+      ElementAccumulator(0)
+    );
+
+    return compare_reference(problem_size, alpha);
+  }
+  
+  /// Returns true if the CUDA device is sufficient to execute the kernel.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename Trmm::TrmmKernel::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /// Executes one test
+  bool run(
+    cutlass::gemm::GemmUniversalMode mode,
+    cutlass::gemm::GemmCoord problem_size,
+    int batch_count = 1,
+    ElementCompute alpha = ElementCompute(1)) {
+
+    // Waive test if insufficient CUDA device
+    if (!sufficient()) {
+      if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) {
+        std::cerr << "Test waived due to insufficient CUDA device." << std::endl;
+      }
+      return true;
+    }
+
+#if 0
+    std::cout << "[TestbedTrmmUniversal::run()] problem(m, n, k): " << problem_size
+              << " alpha: " << ElementCompute(alpha) << std::endl;
+#endif
+
+    this->initialize(problem_size);
+
+    //
+    // Initialize the TRMM operator
+    //
+
+    int batch_stride_A;
+    if (Trmm::kSideMode == cutlass::SideMode::kLeft)
+      batch_stride_A = problem_size.m()*problem_size.m();
+    if (Trmm::kSideMode == cutlass::SideMode::kRight)
+      batch_stride_A = problem_size.n()*problem_size.n();
+
+    typename Trmm::Arguments arguments{
+      mode,
+      problem_size,
+      batch_count,
+      {alpha},
+      tensor_A.device_data(),
+      tensor_B.device_data(),
+      tensor_D.device_data(),
+      batch_stride_A,
+      problem_size.m() * problem_size.n(),
+      problem_size.m() * problem_size.n(),
+      tensor_A.layout().stride(0),
+      tensor_B.layout().stride(0),
+      tensor_D.layout().stride(0)
+    };
+
+    Trmm trmm_op;
+
+    size_t workspace_size = Trmm::get_workspace_size(arguments);
+
+    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+    cutlass::Status status = trmm_op.initialize(arguments, workspace.get());
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Run the TRMM
+    //
+
+    status = trmm_op();
+
+    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);
+
+    //
+    // Verify
+    //
+    bool passed = this->verify(problem_size, alpha);
+
+    if (!passed) {
+      std::stringstream fname;
+
+      fname << "error_Trmm_device_"
+            << "fill_mode_"
+            << (Trmm::kFillMode == cutlass::FillMode::kLower ? "lower_" :
+                (Trmm::kFillMode == cutlass::FillMode::kUpper ? "upper_" : "invalid_"))
+            << "side_mode_"
+            << (Trmm::kSideMode == cutlass::SideMode::kLeft ? "left_" :
+                (Trmm::kSideMode == cutlass::SideMode::kRight ? "right_" : "invalid_")) 
+            << "mnk_"
+            << problem_size.m() << "x"
+            << problem_size.n() << "x"
+            << problem_size.k() << "_"
+            << Trmm::ThreadblockShape::kM << "x"  
+            << Trmm::ThreadblockShape::kN << "x"  
+            << Trmm::ThreadblockShape::kK << "_"
+            << Trmm::WarpShape::kM << "x"  
+            << Trmm::WarpShape::kN << "x"  
+            << Trmm::WarpShape::kK << ".txt";
+
+      std::cout << fname.str() << std::endl;
+
+      std::ofstream results(fname.str());
+
+      results << problem_size << std::endl;
+
+      results
+        << "\nA:\n" << tensor_A.host_view() << "\n"
+        << "\nB:\n" << tensor_B.host_view() << "\n"
+        << "\nD reference:\n" << reference_D.host_view() << "\n"
+        << "\nD computed:\n" << tensor_D.host_view() << "\n";
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename Trmm>
+bool TestTrmmUniversal(
+  cutlass::gemm::GemmCoord const & problem_size,
+  cutlass::gemm::GemmUniversalMode mode,
+  int batch_count,
+  double alpha = 1.0) {
+
+  bool passed = true;
+
+  TestbedTrmmUniversal<Trmm> testbed;
+  
+  using ElementCompute = typename Trmm::EpilogueOutputOp::ElementCompute;
+
+  passed = testbed.run(
+    mode,
+    problem_size,
+    batch_count,
+    cutlass::from_real<ElementCompute>(alpha) 
+  );
+
+  return passed;
+}
+
+template <typename Trmm>
+bool TestAllTrmmUniversal() {
+  bool passed = true;
+
+  int const kMinimumOperandElementSize = int(cutlass::sizeof_bits<typename Trmm::ElementA>::value);
+
+  int const kAlignment = cutlass::platform::is_same<
+                              typename Trmm::OperatorClass, 
+                              cutlass::arch::OpClassSimt>::value ? 1 : 128 / kMinimumOperandElementSize;
+
+  // int8_t gemm alignment constraints
+  int const kAlignmentM = cutlass::platform::is_same<typename Trmm::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Trmm::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Trmm::LayoutA, cutlass::layout::ColumnMajor>::value ? 4 : kAlignment;
+
+  int const kAlignmentN = kAlignmentM;
+
+  int const kAlignmentK = cutlass::platform::is_same<typename Trmm::OperatorClass, cutlass::arch::OpClassSimt>::value &&
+                          cutlass::platform::is_same<typename Trmm::ElementA, int8_t>::value &&
+                          cutlass::platform::is_same<typename Trmm::LayoutA, cutlass::layout::RowMajor>::value
+                           ? 4 : kAlignment;
+
+  cutlass::gemm::GemmUniversalMode modes[] = {
+    cutlass::gemm::GemmUniversalMode::kGemm,
+  };
+
+  int problem_size_m[] = {
+    kAlignmentK, 
+    Trmm::ThreadblockShape::kK * Trmm::kStages - kAlignmentK, 
+    Trmm::ThreadblockShape::kK * Trmm::kStages * 3 - kAlignmentK
+  };
+
+  int problem_size_n[] = {
+    kAlignmentN, 512 - 2*kAlignmentN
+  };
+
+  int batch_counts[] = {      // may be interpretted as batch count or split-K slices
+    1                         // Just running one batch for now (removing 2, 3, 5, 7)
+  };
+
+  double problem_alpha[] = {
+    1.0, 2.0
+  };
+
+  using ElementCompute = typename Trmm::EpilogueOutputOp::ElementCompute;
+
+  for (cutlass::gemm::GemmUniversalMode mode : modes) {
+    for (int m : problem_size_m) {
+      for (int n : problem_size_n) {
+        for (int batch_count : batch_counts) {
+          for (auto alpha : problem_alpha) {
+            
+            int k = 0;
+            if (Trmm::kSideMode == cutlass::SideMode::kLeft)
+              k = m;
+            else if (Trmm::kSideMode == cutlass::SideMode::kRight)
+              k = n;
+
+            if (mode == cutlass::gemm::GemmUniversalMode::kGemm ||
+              mode == cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) {
+
+#if 0
+              // skip very small K problems
+              if (k / batch_count < 2 * Trmm::ThreadblockShape::kK) {
+                continue;
+              }
+#endif
+            }
+            
+            cutlass::gemm::GemmCoord problem_size(m, n, k);
+
+            TestbedTrmmUniversal<Trmm> testbed;
+
+            passed = testbed.run(
+              mode,
+              problem_size,
+              batch_count,
+              cutlass::from_real<ElementCompute>(alpha) 
+            );
+
+            if (!passed) {
+              return false;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return passed;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace gemm
+} // namespace test
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h
index 8a63cc7a..ca85e062 100644
--- a/test/unit/gemm/device/testbed_universal.h
+++ b/test/unit/gemm/device/testbed_universal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h
index 4ca6a750..b5b76616 100644
--- a/test/unit/gemm/device/testbed_utils.h
+++ b/test/unit/gemm/device/testbed_utils.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
new file mode 100644
index 00000000..f3029e62
--- /dev/null
+++ b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -0,0 +1,305 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_l_nu_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_u_nu_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_rs_u_nu_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_l_un_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_u_un_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_rs_u_un_tensor_op_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
new file mode 100644
index 00000000..44f11d66
--- /dev/null
+++ b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -0,0 +1,305 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_l_nu_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_u_nu_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_rs_u_nu_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_l_un_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_ls_u_un_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf32n_cf32t_cf32t_rs_u_un_tensor_op_fast_f32, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<float>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<float>,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplexFastF32,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
new file mode 100644
index 00000000..e95ff748
--- /dev/null
+++ b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -0,0 +1,137 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64n_cf64n_cf64t_ls_u_nu_tensor_op_f64_gaussian, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64h_cf64n_cf64t_ls_u_nu_tensor_op_f64_gaussian, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddGaussianComplex,
+    cutlass::ComplexTransform::kConjugate
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
new file mode 100644
index 00000000..0ca88291
--- /dev/null
+++ b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -0,0 +1,301 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64n_cf64n_cf64t_ls_l_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64n_cf64n_cf64t_ls_u_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64h_cf64n_cf64t_ls_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64n_cf64n_cf64t_ls_l_un_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64n_cf64n_cf64t_ls_u_un_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kNone
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_cf64h_cf64n_cf64t_ls_u_un_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = cutlass::complex<double>;
+  using ElementAccumulator = cutlass::complex<double>;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    cutlass::complex<double>,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::ComplexTransform::kConjugate
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..c31a2953
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,500 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
new file mode 100644
index 00000000..0d8867ef
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
@@ -0,0 +1,252 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_u_nu_tensor_op_fast_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_u_nu_tensor_op_fast_f32_align1_align1, 128x64x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_l_nu_tensor_op_fast_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_u_nu_tensor_op_fast_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_u_nu_tensor_op_fast_f32_align1_align4, 128x64x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32n_f32t_f32t_rs_l_nu_tensor_op_fast_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..e165ba87
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,449 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align1, 64x128x32_32x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::ColumnMajor,
+    float, cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_l_nu_tensor_op_fast_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_u_nu_tensor_op_fast_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_u_nu_tensor_op_fast_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_u_nu_tensor_op_fast_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32n_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
new file mode 100644
index 00000000..89281d6a
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -0,0 +1,458 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_l_un_tensor_op_fast_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_l_un_tensor_op_fast_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_l_un_tensor_op_fast_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_l_un_tensor_op_fast_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_l_nu_tensor_op_fast_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_u_un_tensor_op_fast_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f32t_f32n_f32t_ls_u_nu_tensor_op_fast_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAddFastF32
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..aa6706f3
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,414 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_l_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_l_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_l_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_l_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_l_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_u_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_u_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_u_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_ls_u_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..463e93fc
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,415 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_l_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_l_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_l_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_l_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_l_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_u_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_u_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_u_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64n_f64t_rs_u_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..4202e02d
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,415 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_l_un_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_l_un_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_l_un_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_l_un_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_l_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_u_un_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_u_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_u_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64n_f64t_f64t_rs_u_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
new file mode 100644
index 00000000..5fda7ccc
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
@@ -0,0 +1,414 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_l_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_l_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_l_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_l_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_l_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_u_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_u_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_u_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_ls_u_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
new file mode 100644
index 00000000..c7d2bf33
--- /dev/null
+++ b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
@@ -0,0 +1,415 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_l_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_l_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_l_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_l_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_l_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_u_nu_tensor_op_f64, 32x32x16_16x16x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 16, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_u_nu_tensor_op_f64, 64x64x16_32x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_u_nu_tensor_op_f64, 128x64x16_64x32x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_u_nu_tensor_op_f64, 64x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_f64t_f64t_f64n_rs_u_nu_tensor_op_f64, 128x128x16_32x64x16) {
+
+  using ElementOutput = double;
+  using ElementAccumulator = double;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    double,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kRight,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    double,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..a56c45d6
--- /dev/null
+++ b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,500 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_l_nu_tensor_op_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_u_nu_tensor_op_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_u_nu_tensor_op_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_u_nu_tensor_op_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_u_nu_tensor_op_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_ls_u_nu_tensor_op_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::RowMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
new file mode 100644
index 00000000..7ffdc202
--- /dev/null
+++ b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
@@ -0,0 +1,252 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_u_nu_tensor_op_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_u_nu_tensor_op_f32_align1_align1, 128x64x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_l_nu_tensor_op_f32_align1_align1, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_u_nu_tensor_op_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_u_nu_tensor_op_f32_align1_align4, 128x64x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kUpper, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32n_tf32t_f32t_rs_l_nu_tensor_op_f32_align1_align4, 64x128x32_32x64x32) {
+
+using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::ColumnMajor,
+    cutlass::SideMode::kRight, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::RowMajor,
+    float, cutlass::layout::RowMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..1b042afb
--- /dev/null
+++ b/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,449 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_l_nu_tensor_op_f32_align1_align1, 64x128x32_32x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float, cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft, cutlass::FillMode::kLower, cutlass::DiagType::kNonUnit,
+    float, cutlass::layout::ColumnMajor,
+    float, cutlass::layout::ColumnMajor,
+    float,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 32>,
+    cutlass::gemm::GemmShape<32, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
+    3,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_l_nu_tensor_op_f32_align1_align4, 128x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_l_nu_tensor_op_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_l_nu_tensor_op_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_l_nu_tensor_op_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_u_nu_tensor_op_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_u_nu_tensor_op_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_u_nu_tensor_op_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_u_nu_tensor_op_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32n_ls_u_nu_tensor_op_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::ColumnMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
new file mode 100644
index 00000000..185dda59
--- /dev/null
+++ b/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
@@ -0,0 +1,458 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide TRMM interface
+
+  
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/blas3.h"
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/trmm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed_trmm_universal.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////Test name//////////////////////////////////////////////////
+//                             
+// SM80_Device_Trmm_{ElementA}{LayoutA}_{ElementB}{LayoutB}_{ElementC}{LayoutC}_{SideMode}_{FillMode}\
+//    _{DiagType}_tensor_op_{ElementAccumulator}_align{AlignmentA}_align{AlignmentB}
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_l_un_tensor_op_f32_align1_align1, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      1,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    1,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_l_un_tensor_op_f32_align1_align4, 64x64x32_32x32x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<32, 32, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_l_un_tensor_op_f32_align1_align4, 256x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_l_un_tensor_op_f32_align1_align4, 128x256x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_l_nu_tensor_op_f32_align1_align4, 256x128x32_64x64x32) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kLower,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_u_nu_tensor_op_f32_align1_align4, 64x64x16_32x32x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    10,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_u_un_tensor_op_f32_align1_align4, 128x128x16_64x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_u_nu_tensor_op_f32_align1_align4, 256x128x16_128x64x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 128, 16>,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_u_nu_tensor_op_f32_align1_align4, 128x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+  
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Trmm_tf32t_tf32n_f32t_ls_u_nu_tensor_op_f32_align1_align4, 256x256x16_64x128x16) {
+
+  using ElementOutput = float;
+  using ElementAccumulator = float;
+
+  using Trmm = cutlass::gemm::device::Trmm<
+    float,
+    cutlass::layout::RowMajor,
+    cutlass::SideMode::kLeft,
+    cutlass::FillMode::kUpper,
+    cutlass::DiagType::kNonUnit,
+    float,
+    cutlass::layout::ColumnMajor,
+    ElementOutput,
+    cutlass::layout::RowMajor,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<256, 256, 16>,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementOutput,
+      128 / cutlass::sizeof_bits<ElementOutput>::value,
+      ElementAccumulator,
+      ElementAccumulator
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3,
+    1,
+    4,
+    false,
+    cutlass::arch::OpMultiplyAdd
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllTrmmUniversal<Trmm>());
+}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
diff --git a/test/unit/gemm/kernel/batched_gemv.cu b/test/unit/gemm/kernel/batched_gemv.cu
index 3f340948..f3fc2d97 100755
--- a/test/unit/gemm/kernel/batched_gemv.cu
+++ b/test/unit/gemm/kernel/batched_gemv.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h
index fe77440d..17656d84 100755
--- a/test/unit/gemm/kernel/testbed_gemv.h
+++ b/test/unit/gemm/kernel/testbed_gemv.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt
index 0176fbad..e63cdb74 100644
--- a/test/unit/gemm/thread/CMakeLists.txt
+++ b/test/unit/gemm/thread/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu
index 85ddb324..290eaaf2 100644
--- a/test/unit/gemm/thread/gemm_sm50.cu
+++ b/test/unit/gemm/thread/gemm_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu
index 4c051033..c01c75b4 100644
--- a/test/unit/gemm/thread/gemm_sm60.cu
+++ b/test/unit/gemm/thread/gemm_sm60.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu
index 7b3791d3..c84ebd1f 100644
--- a/test/unit/gemm/thread/gemm_sm61.cu
+++ b/test/unit/gemm/thread/gemm_sm61.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt
index 3294dc0d..2d9b83cf 100644
--- a/test/unit/gemm/thread/host/CMakeLists.txt
+++ b/test/unit/gemm/thread/host/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu
index 8535acdd..e54069eb 100644
--- a/test/unit/gemm/thread/host/gemm_sm60_host.cu
+++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h
index b8546e91..8e474acb 100644
--- a/test/unit/gemm/thread/host/testbed_host.h
+++ b/test/unit/gemm/thread/host/testbed_host.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h
index 76a31ebe..db8d0cc4 100644
--- a/test/unit/gemm/thread/testbed.h
+++ b/test/unit/gemm/thread/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt
index 070da809..b77a2e63 100644
--- a/test/unit/gemm/threadblock/CMakeLists.txt
+++ b/test/unit/gemm/threadblock/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
@@ -25,6 +31,8 @@ cutlass_test_unit_add_executable(
   mma_multistage.cu
   mma_multistage_sparse.cu
   mma_pipelined_sm80.cu
+  mma_multistage_slicedk.cu
+  mma_pipelined_slicedk.cu
   mma_pipelined_wmma_sm70.cu
   mma_pipelined_wmma_sm75.cu
   mma_singlestage_wmma_sm70.cu
diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu
index 5a36e284..78d8e50e 100644
--- a/test/unit/gemm/threadblock/batched_gemv.cu
+++ b/test/unit/gemm/threadblock/batched_gemv.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu
index 616cdaef..f7c57681 100644
--- a/test/unit/gemm/threadblock/epilogue_workspace.cu
+++ b/test/unit/gemm/threadblock/epilogue_workspace.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_multistage.cu b/test/unit/gemm/threadblock/mma_multistage.cu
index df489734..26730e54 100644
--- a/test/unit/gemm/threadblock/mma_multistage.cu
+++ b/test/unit/gemm/threadblock/mma_multistage.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_multistage_slicedk.cu b/test/unit/gemm/threadblock/mma_multistage_slicedk.cu
new file mode 100644
index 00000000..e14f3c87
--- /dev/null
+++ b/test/unit/gemm/threadblock/mma_multistage_slicedk.cu
@@ -0,0 +1,111 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Unit tests for CTA-level GEMM specifically for sliced-k kernels (SM_61 and SM_75)
+*/
+
+#include "mma_multistage_testbed_slicedk.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Tensor Op GEMM for SM_80
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_gemm_threadblock_congruous_sliced, tensor_op_128x64x256_tb128x64x64_warp64x64x32_16x8x16) {
+
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+
+  cutlass::gemm::GemmCoord problem_size(128, 64, 256);
+
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  float alpha = 1.f;
+  float beta = 0.0f;
+  int const Stages = 3;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp,
+      Stages>;
+
+  dim3 grid(1, 1);
+  dim3 block(32, 4, 1);
+
+  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
+                                            problem_size.k(), alpha, beta)
+      .run(grid, block);
+}
+
+TEST(SM80_gemm_threadblock_crosswise_sliced, tensor_op_128x64x256_tb128x64x64_warp64x64x32_16x8x16) {
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+
+  cutlass::gemm::GemmCoord problem_size(128, 64, 256);
+
+  using ThreadblockShape = cutlass::gemm::GemmShape<128, 64, 64>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
+
+  float alpha = 1.f;
+  float beta = 0.0f;
+  int const Stages = 3;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp,
+      Stages>;
+
+  dim3 grid(1, 1);
+  dim3 block(32, 4, 1);
+
+  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
+                                            problem_size.k(), alpha, beta)
+      .run(grid, block);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse.cu b/test/unit/gemm/threadblock/mma_multistage_sparse.cu
index 14f37254..c66faa05 100644
--- a/test/unit/gemm/threadblock/mma_multistage_sparse.cu
+++ b/test/unit/gemm/threadblock/mma_multistage_sparse.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
index 8419d428..7ae54704 100644
--- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -229,12 +235,47 @@ struct SparseTestbed {
         cutlass::make_Coord(m, k / Sparse / ElementsPerElementE));
   }
 
+  /// Returns true if the CUDA device is sufficient to execute the kernel.
+  bool sufficient() const {
+    //
+    // Determine SMEM requirements and waive if not satisfied
+    //
+
+    int smem_size = int(sizeof(typename Mma::SharedStorage));
+
+    cudaDeviceProp properties;
+    int device_idx;
+    cudaError_t result = cudaGetDevice(&device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDevice() API call failed.");
+    }
+
+    result = cudaGetDeviceProperties(&properties, device_idx);
+
+    if (result != cudaSuccess) {
+      throw std::runtime_error("cudaGetDeviceProperties() failed");
+    }
+
+    if (properties.sharedMemPerMultiprocessor < smem_size) {
+      return false;
+    }
+
+    return true;
+  }
+
   /// Runs the test
   bool run(
       dim3 grid, dim3 block,
       cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
       cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform,
       cutlass::Distribution::Kind init_E = cutlass::Distribution::Uniform) {
+
+    // Waive the test
+    if (!sufficient()) {
+      return true;
+    }
+
     //
     // initialize device memory
     //
diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h
index 31cf0395..069ba0ad 100644
--- a/test/unit/gemm/threadblock/mma_multistage_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
new file mode 100644
index 00000000..a9cf2c62
--- /dev/null
+++ b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
@@ -0,0 +1,387 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Unit testbed for kernel-level GEMM
+*/
+
+#pragma once
+
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core_sm80.h"
+#include "cutlass/transform/threadblock/predicated_tile_access_iterator.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/platform/platform.h"
+
+namespace test {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Mma>
+__global__ void kernel_multistage_mma(cutlass::gemm::GemmCoord problem_size,
+                           typename Mma::IteratorA::Params params_A,
+                           typename Mma::IteratorA::TensorRef ref_A,
+                           typename Mma::IteratorB::Params params_B,
+                           typename Mma::IteratorB::TensorRef ref_B,
+                           typename Mma::ElementC **ptr_C,
+                           typename Mma::LayoutC::Stride::Index ldc) {
+  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
+
+  // Dynamic shared memory base pointer
+  extern __shared__ int GemmSharedStorageBase[];
+
+  // Declare pointer to dynamic shared memory.
+  typename Mma::SharedStorage *shared_storage =
+      reinterpret_cast<typename Mma::SharedStorage *>(GemmSharedStorageBase);
+
+  // Compute threadblock location
+  cutlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y),
+                                             0};
+
+  cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
+                                   tb_tile_offset.k()};
+
+  cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
+                                   tb_tile_offset.n() * Mma::Shape::kN};
+
+  // Compute position within threadblock
+  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+  // Construct iterators to A and B operands
+  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
+                                     {problem_size.m(), problem_size.k()},
+                                     tb_thread_id, tb_offset_A);
+
+  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
+                                     {problem_size.k(), problem_size.n()},
+                                     tb_thread_id, tb_offset_B);
+
+  int warp_id = __shfl_sync(0xffffffff, threadIdx.y, 0);
+  int lane_id = threadIdx.x;
+
+  int partitionsK_idx = warp_id / (Mma::WarpCount::kM * Mma::WarpCount::kN);
+
+  // Construct thread-scoped matrix multiply
+  Mma mma(*shared_storage, tb_thread_id, warp_id, threadIdx.x);
+
+  typename Mma::FragmentC accum;
+
+  accum.clear();
+
+  int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+  // Compute threadblock-scoped matrix multiply-add
+  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+
+  // Output results
+  typename Mma::Operator::IteratorC iterator_C({ptr_C[partitionsK_idx], ldc}, lane_id);
+
+  int warp_idx_mn = warp_id % (Mma::WarpCount::kM * Mma::WarpCount::kN);
+  iterator_C.add_tile_offset(
+      {(tb_tile_offset.m() * Mma::WarpCount::kM) +
+           (warp_idx_mn % Mma::WarpCount::kM),
+       (tb_tile_offset.n() * Mma::WarpCount::kN) +
+           (warp_idx_mn / Mma::WarpCount::kM)});
+
+  iterator_C.store(accum);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product
+template <
+    /// Threadblock-level matrix multiply-accumulate
+    typename MmaCore_>
+struct Testbed {
+  /// Threadblock-level GEMM implementation
+  using MmaCore = MmaCore_;
+  using ThreadblockShape = typename MmaCore::Shape;
+  using WarpShape = typename MmaCore::WarpShape;
+  using InstructionShape = typename MmaCore::InstructionShape;
+  using ElementA = typename MmaCore::ElementA;
+  using LayoutA = typename MmaCore::LayoutA;
+  using ElementB = typename MmaCore::ElementB;
+  using LayoutB = typename MmaCore::LayoutB;
+  using ElementC = typename MmaCore::ElementC;
+  using LayoutC = typename MmaCore::LayoutC;
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeA = cutlass::Array<ElementA, ThreadMapA::kElementsPerAccess>;
+  using AccessTypeB = cutlass::Array<ElementB, ThreadMapB::kElementsPerAccess>;
+  static int const Stages = MmaCore::kStages;
+  static cutlass::arch::CacheOperation::Kind const CacheOpA =
+      MmaCore::kCacheOpA;
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+      MmaCore::kCacheOpB;
+
+  // Define iterators over tiles from the A operand
+  using IteratorA =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB =
+      cutlass::transform::threadblock::PredicatedTileAccessIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using Mma = cutlass::gemm::threadblock::MmaMultistage<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, CacheOpA,
+      IteratorB, typename MmaCore::SmemIteratorB, CacheOpB, ElementC, LayoutC,
+      typename MmaCore::MmaPolicy, Stages>;
+
+  static int const kPartitionsK = MmaCore::MmaPolicy::kPartitionsK; 
+
+  //
+  // Data members
+  //
+
+  cutlass::HostTensor<ElementA, LayoutA> matrix_A;
+  cutlass::HostTensor<ElementB, LayoutB> matrix_B;
+  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed[kPartitionsK];
+  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference;
+  cutlass::HostTensor<ElementC*, cutlass::layout::PackedVectorLayout> matrix_C_pointers;
+
+  cutlass::gemm::GemmCoord problem_size;
+  float alpha, beta;
+
+  //
+  // Methods
+  //
+
+  /// Allocates workspace in device memory
+  Testbed(int m, int n, int k, float alpha_ = float(1), float beta_ = float(0))
+      : problem_size(m, n, k), alpha(alpha_), beta(beta_) {
+    matrix_A.reset(cutlass::make_Coord(m, k));
+    matrix_B.reset(cutlass::make_Coord(k, n));
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].reset(cutlass::make_Coord(m, n));
+
+    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
+    matrix_C_pointers.reset(cutlass::Coord<1>(kPartitionsK));
+  }
+
+  /// Runs the test
+  bool run(
+      dim3 grid, dim3 block,
+      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
+      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {
+    //
+    // initialize device memory
+    //
+
+    if (init_A == cutlass::Distribution::Uniform) {
+
+      int scope_max = 8;
+      int scope_min = -8;
+
+      if (cutlass::sizeof_bits<ElementA>::value == 4) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (cutlass::sizeof_bits<ElementA>::value == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      }
+
+      uint64_t seed = 7;
+      cutlass::reference::host::TensorFillRandomUniform(
+          matrix_A.host_view(), seed, scope_max, scope_min, 0);
+    } else if (init_A == cutlass::Distribution::Sequential) {
+      cutlass::reference::host::BlockFillSequential(matrix_A.host_data(),
+                                                    matrix_A.capacity());
+    } else if (init_A == cutlass::Distribution::Identity) {
+      cutlass::reference::host::TensorFillIdentity(matrix_A.host_view());
+    } else {
+      // TODO: Implement the rest
+      return false;
+    }
+
+    if (init_B == cutlass::Distribution::Uniform) {
+
+      int scope_max = 8;
+      int scope_min = -8;
+
+      if (cutlass::sizeof_bits<ElementB>::value == 4) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (cutlass::sizeof_bits<ElementB>::value == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      }
+
+      uint64_t seed = 7;
+      cutlass::reference::host::TensorFillRandomUniform(
+          matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
+    } else if (init_B == cutlass::Distribution::Sequential) {
+      cutlass::reference::host::BlockFillSequential(matrix_B.host_data(),
+                                                    matrix_B.capacity());
+    } else if (init_B == cutlass::Distribution::Identity) {
+      cutlass::reference::host::TensorFillIdentity(matrix_B.host_view());
+    } else {
+      // TODO: Implement the rest
+      return false;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      cutlass::reference::host::TensorFill(matrix_C_computed[k].host_view());
+
+    cutlass::reference::host::TensorFill(matrix_C_reference.host_view());
+
+    matrix_A.sync_device();
+    matrix_B.sync_device();
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].sync_device();
+
+    typename IteratorA::Params params_A(matrix_A.layout());
+    typename IteratorB::Params params_B(matrix_B.layout());
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_pointers.at(cutlass::Coord<1>(k)) = matrix_C_computed[k].device_data();
+
+    matrix_C_pointers.sync_device();
+
+    cudaError_t result;
+
+    int smem_size = int(sizeof(typename Mma::SharedStorage));
+    if (smem_size >= (48 << 10)) {
+      result = cudaFuncSetAttribute(
+          test::gemm::threadblock::kernel_multistage_mma<Mma>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+
+      EXPECT_EQ(result, cudaSuccess)
+          << " cudaFuncSetAttribute "
+             "cudaFuncAttributeMaxDynamicSharedMemorySize error: "
+          << cudaGetErrorString(result);
+
+      result = cudaFuncSetAttribute(
+          test::gemm::threadblock::kernel_multistage_mma<Mma>,
+          cudaFuncAttributePreferredSharedMemoryCarveout, 100);
+
+      EXPECT_EQ(result, cudaSuccess)
+          << " cudaFuncSetAttribute "
+             "cudaFuncAttributePreferredSharedMemoryCarveout error: "
+          << cudaGetErrorString(result);
+    }
+
+    test::gemm::threadblock::kernel_multistage_mma<Mma><<<grid, block, smem_size, 0>>>(
+        problem_size, params_A, matrix_A.device_ref(), params_B,
+        matrix_B.device_ref(), matrix_C_pointers.device_data(),
+        matrix_C_computed[0].layout().stride(0));
+
+    //
+    // Check error code
+    //
+
+    result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess)
+        << " kernel error: " << cudaGetErrorString(result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].sync_host();
+
+    // TODO: this is temporary. it will be removed after slicing can de
+    // reduction
+    //
+    // Reduce matrix_C_computed
+    //
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 1; k < kPartitionsK; k++) {
+      CUTLASS_PRAGMA_UNROLL
+      for(int m = 0; m < matrix_C_computed[0].extent().row(); m++){
+        CUTLASS_PRAGMA_UNROLL
+        for(int n = 0; n < matrix_C_computed[0].extent().column(); n++){
+          matrix_C_computed[0].at({m, n}) += matrix_C_computed[k].at({m, n});
+        }
+      }
+    }
+
+    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
+                                   ElementC, LayoutC, ElementC, ElementC,
+                                   typename MmaCore::Operator>
+        reference_gemm;
+
+    reference_gemm(
+        problem_size, ElementC(alpha), matrix_A.host_view(),
+        matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());
+
+    bool passed = cutlass::reference::host::TensorEquals(
+        matrix_C_computed[0].host_view(), matrix_C_reference.host_view());
+
+    EXPECT_TRUE(passed);
+
+    if (!passed) {
+      std::ofstream output("mma_multistage_testbed_errors.txt");
+
+      output
+        << "A:\n" << matrix_A.host_view() << "\n"
+        << "B:\n" << matrix_B.host_view() << "\n"
+        << "Reference:\n"
+        << matrix_C_reference.host_view() << "\n"
+        << "Computed:\n"
+        << matrix_C_computed[0].host_view() << "\n";
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace test
diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu
index dffdf84d..c2cb5408 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu b/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
new file mode 100644
index 00000000..455cf21e
--- /dev/null
+++ b/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
@@ -0,0 +1,186 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Unit tests for CTA-level GEMM specifically for sliced-k kernels (SM_61 and SM_75)
+*/
+
+#include "mma_pipelined_testbed_slicedk.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// igemm_NT DP4A
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM61_igemm_sliced_k, igemm_int8_nt_32x32x128_32x32x4) {
+    using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+        cutlass::gemm::GemmShape<32, 32, 128>,   // ThreadblockShape,
+        cutlass::gemm::GemmShape<32, 32, 32>,    // WarpShape,
+        cutlass::gemm::GemmShape<1, 1, 4>,      // InstructionShape,
+        int8_t,                                 // ElementA,
+        cutlass::layout::ColumnMajor,           // LayoutA,
+        int8_t,                                 // ElementB,
+        cutlass::layout::RowMajor,              // LayoutB,
+        int,                                    // ElementC,
+        cutlass::layout::RowMajor,              // LayoutC,
+        cutlass::arch::OpClassSimt,             // OpClass
+        2>;                                     // Stages,
+
+    cutlass::gemm::GemmCoord problem_size(32, 32, 128);
+    float alpha = 1.f;
+    float beta = 0.0f;
+    dim3 grid(1, 1);
+    dim3 block(32, 4, 1);
+    test::gemm::threadblock::Testbed<MmaCore>(
+        problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
+        .run(grid, block, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform);
+}
+
+TEST(SM61_igemm_sliced_k_big, igemm_int8_nt_32x32x128_32x32x4_bigk) {
+    using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+        cutlass::gemm::GemmShape<32, 32, 128>,   // ThreadblockShape,
+        cutlass::gemm::GemmShape<32, 32, 32>,    // WarpShape,
+        cutlass::gemm::GemmShape<1, 1, 4>,      // InstructionShape,
+        int8_t,                                 // ElementA,
+        cutlass::layout::ColumnMajor,           // LayoutA,
+        int8_t,                                 // ElementB,
+        cutlass::layout::RowMajor,              // LayoutB,
+        int,                                    // ElementC,
+        cutlass::layout::RowMajor,              // LayoutC,
+        cutlass::arch::OpClassSimt,             // OpClass
+        2>;                                     // Stages,
+
+    cutlass::gemm::GemmCoord problem_size(32, 32, 1024);
+    float alpha = 1.f;
+    float beta = 0.0f;
+    dim3 grid(1, 1);
+    dim3 block(32, 4, 1);
+    test::gemm::threadblock::Testbed<MmaCore>(
+        problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
+        .run(grid, block, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform);
+}
+
+
+TEST(SM61_igemm_sliced_k, igemm_int8_nt_32x64x128_32x32x4) {
+    using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+        cutlass::gemm::GemmShape<32, 64, 128>,   // ThreadblockShape,
+        cutlass::gemm::GemmShape<32, 32, 64>,    // WarpShape,
+        cutlass::gemm::GemmShape<1, 1, 4>,      // InstructionShape,
+        int8_t,                                 // ElementA,
+        cutlass::layout::ColumnMajor,           // LayoutA,
+        int8_t,                                 // ElementB,
+        cutlass::layout::RowMajor,              // LayoutB,
+        int,                                    // ElementC,
+        cutlass::layout::RowMajor,              // LayoutC,
+        cutlass::arch::OpClassSimt,             // OpClass
+        2>;                                     // Stages,
+
+    cutlass::gemm::GemmCoord problem_size(32, 64, 256);
+    float alpha = 1.f;
+    float beta = 0.0f;
+    dim3 grid(1, 1);
+    dim3 block(32, 4, 1);
+    test::gemm::threadblock::Testbed<MmaCore>(
+        problem_size.m(), problem_size.n(), problem_size.k(), alpha, beta)
+        .run(grid, block, cutlass::Distribution::Uniform, cutlass::Distribution::Uniform);
+}
+
+#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Tensor Op GEMM for SM_75
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_gemm_threadblock_congruous_sliced, tensor_op_64x64x256_tb64x64x64_warp64x32x32_16x8x8) {
+
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::RowMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+
+  cutlass::gemm::GemmCoord problem_size(64, 64, 256);
+
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  float alpha = 1.f;
+  float beta = 0.0f;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2,
+      cutlass::arch::OpMultiplyAdd>;
+
+  dim3 grid(1, 1);
+  dim3 block(32, 4, 1);
+
+  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
+                                            problem_size.k(), alpha, beta)
+      .run(grid, block);
+}
+
+TEST(SM75_gemm_threadblock_crosswise_sliced, tensor_op_64x64x256_tb64x64x64_warp64x32x32_16x8x8) {
+  using ElementA = cutlass::half_t;
+  using LayoutA = cutlass::layout::RowMajor;
+  using ElementB = cutlass::half_t;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using ElementC = float;
+  using LayoutC = cutlass::layout::ColumnMajor;
+
+  cutlass::gemm::GemmCoord problem_size(64, 64, 256);
+
+  using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
+  using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  float alpha = 1.f;
+  float beta = 0.0f;
+
+  // Define the MmaCore components
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+      ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA,
+      ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2,
+      cutlass::arch::OpMultiplyAdd>;
+
+  dim3 grid(1, 1);
+  dim3 block(32, 4, 1);
+
+  test::gemm::threadblock::Testbed<MmaCore>(problem_size.m(), problem_size.n(),
+                                            problem_size.k(), alpha, beta)
+      .run(grid, block);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
index d3d5c19d..9108d335 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
index af6f5534..5cd20fd4 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
index 592e8fa3..1a80c699 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h
index c157a88a..bdc0d873 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h
+++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h
@@ -1,27 +1,31 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- *modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its
- *contributors may be used to endorse or promote products derived from this
- *software without specific prior written permission.
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
- *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 /*! \file
diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
new file mode 100644
index 00000000..d0e0e05a
--- /dev/null
+++ b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
@@ -0,0 +1,370 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Unit testbed for kernel-level GEMM
+*/
+
+#pragma once
+
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/vector.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+#include "cutlass/gemm/threadblock/default_mma_core_simt.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm75.h"
+#include "cutlass/gemm/threadblock/default_mma_core_sm70.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator.h"
+#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/platform/platform.h"
+
+namespace test {
+namespace gemm {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Mma>
+__global__ void kernel_mma(cutlass::gemm::GemmCoord problem_size,
+                           typename Mma::IteratorA::Params params_A,
+                           typename Mma::IteratorA::TensorRef ref_A,
+                           typename Mma::IteratorB::Params params_B,
+                           typename Mma::IteratorB::TensorRef ref_B,
+                           typename Mma::ElementC **ptr_C,
+                           typename Mma::LayoutC::Stride::Index ldc) {
+  // Shared storage needed by threadblock-scoped matrix multiply-accumulate
+  __shared__ typename Mma::SharedStorage shared_storage;
+
+  // Compute threadblock location
+  cutlass::gemm::GemmCoord tb_tile_offset = {int(blockIdx.x), int(blockIdx.y),
+                                             0};
+
+  cutlass::MatrixCoord tb_offset_A{tb_tile_offset.m() * Mma::Shape::kM,
+                                   tb_tile_offset.k()};
+
+  cutlass::MatrixCoord tb_offset_B{tb_tile_offset.k(),
+                                   tb_tile_offset.n() * Mma::Shape::kN};
+
+  // Compute position within threadblock
+  int tb_thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+  // Construct iterators to A and B operands
+  typename Mma::IteratorA iterator_A(params_A, ref_A.data(),
+                                     {problem_size.m(), problem_size.k()},
+                                     tb_thread_id, tb_offset_A);
+
+  typename Mma::IteratorB iterator_B(params_B, ref_B.data(),
+                                     {problem_size.k(), problem_size.n()},
+                                     tb_thread_id, tb_offset_B);
+
+  int warp_id = threadIdx.y;
+  int lane_id = threadIdx.x;
+
+  int partitionsK_idx = warp_id / (Mma::WarpCount::kM * Mma::WarpCount::kN);
+
+  // Construct thread-scoped matrix multiply
+  Mma mma(shared_storage, tb_thread_id, warp_id, threadIdx.x);
+
+  typename Mma::FragmentC accum;
+
+  accum.clear();
+
+  int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK;
+
+  // Compute threadblock-scoped matrix multiply-add
+  mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum);
+
+  // Output results
+  typename Mma::Operator::IteratorC iterator_C({ptr_C[partitionsK_idx], ldc}, lane_id);
+
+
+  int warp_idx_mn = warp_id % (Mma::WarpCount::kM * Mma::WarpCount::kN);
+  iterator_C.add_tile_offset(
+      {(tb_tile_offset.m() * Mma::WarpCount::kM) +
+           (warp_idx_mn % Mma::WarpCount::kM),
+       (tb_tile_offset.n() * Mma::WarpCount::kN) +
+           (warp_idx_mn / Mma::WarpCount::kM)});
+
+  iterator_C.store(accum);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Structure to compute the matrix product
+template <
+    /// Threadblock-level matrix multiply-accumulate
+    typename MmaCore_>
+struct Testbed {
+  /// Threadblock-level GEMM implementation
+  using MmaCore = MmaCore_;
+  using ThreadblockShape = typename MmaCore::Shape;
+  using WarpShape = typename MmaCore::WarpShape;
+  using InstructionShape = typename MmaCore::InstructionShape;
+  using ElementA = typename MmaCore::ElementA;
+  using LayoutA = typename MmaCore::LayoutA;
+  using ElementB = typename MmaCore::ElementB;
+  using LayoutB = typename MmaCore::LayoutB;
+  using ElementC = typename MmaCore::ElementC;
+  using LayoutC = typename MmaCore::LayoutC;
+
+  // Define iterators over tiles from the A operand
+  static const bool use_idp4a = cutlass::platform::is_same<ElementA, int8_t>::value && 
+                                cutlass::platform::is_same<ElementB, int8_t>::value && 
+                                cutlass::platform::is_same<typename MmaCore::OperatorClass, cutlass::arch::OpClassSimt>::value;
+
+  static const bool transposeA =  cutlass::platform::is_same< LayoutA, cutlass::layout::ColumnMajor >::value;
+  static const bool transposeB =  cutlass::platform::is_same< LayoutB, cutlass::layout::RowMajor >::value;
+
+  using IteratorA = typename cutlass::platform::conditional< use_idp4a,
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA, transposeA> ,
+        
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+          ElementA, LayoutA, 1, typename MmaCore::IteratorThreadMapA>
+      >::type;
+
+  // Define iterators over tiles from the B operand
+  using IteratorB = typename cutlass::platform::conditional< use_idp4a,
+      cutlass::transform::threadblock::PredicatedTileIterator2dThreadTile<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB, transposeB> ,
+
+      cutlass::transform::threadblock::PredicatedTileIterator<
+          cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+          ElementB, LayoutB, 0, typename MmaCore::IteratorThreadMapB>
+      >::type;
+
+  // Define the threadblock-scoped pipelined matrix multiply
+  using Mma = cutlass::gemm::threadblock::MmaPipelined<
+      typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA,
+      IteratorB, typename MmaCore::SmemIteratorB, ElementC, LayoutC,
+      typename MmaCore::MmaPolicy>;
+
+  static int const kPartitionsK = MmaCore::MmaPolicy::kPartitionsK; 
+
+  //
+  // Data members
+  //
+
+  cutlass::HostTensor<ElementA, LayoutA> matrix_A;
+  cutlass::HostTensor<ElementB, LayoutB> matrix_B;
+  cutlass::HostTensor<ElementC, LayoutC> matrix_C_computed[kPartitionsK];
+  cutlass::HostTensor<ElementC, LayoutC> matrix_C_reference;
+  cutlass::HostTensor<ElementC*, cutlass::layout::PackedVectorLayout> matrix_C_pointers;
+
+  cutlass::gemm::GemmCoord problem_size;
+  float alpha, beta;
+
+  //
+  // Methods
+  //
+
+  /// Allocates workspace in device memory
+  Testbed(int m, int n, int k, float alpha_, float beta_)
+      : problem_size(m, n, k), alpha(alpha_), beta(beta_) {
+    matrix_A.reset(cutlass::make_Coord(m, k));
+    matrix_B.reset(cutlass::make_Coord(k, n));
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].reset(cutlass::make_Coord(m, n));
+
+    matrix_C_reference.reset(cutlass::make_Coord(m, n), false);
+    matrix_C_pointers.reset(cutlass::Coord<1>(kPartitionsK));
+  }
+
+  /// Runs the test
+  bool run(
+      dim3 grid, dim3 block,
+      cutlass::Distribution::Kind init_A = cutlass::Distribution::Uniform,
+      cutlass::Distribution::Kind init_B = cutlass::Distribution::Uniform) {
+    //
+    // initialize device memory
+    //
+
+    if (init_A == cutlass::Distribution::Uniform) {
+
+      int scope_max = 8;
+      int scope_min = -8;
+
+      if (cutlass::sizeof_bits<ElementA>::value == 4) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (cutlass::sizeof_bits<ElementA>::value == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      }
+
+      uint64_t seed = 7;
+      cutlass::reference::host::TensorFillRandomUniform(
+          matrix_A.host_view(), seed, scope_max, scope_min, 0);
+    } else if (init_A == cutlass::Distribution::Sequential) {
+      cutlass::reference::host::BlockFillSequential(matrix_A.host_data(),
+                                                    matrix_A.capacity());
+    } else if (init_A == cutlass::Distribution::Identity) {
+      cutlass::reference::host::TensorFillIdentity(matrix_A.host_view());
+    } else {
+      // TODO: Implement the rest
+      return false;
+    }
+
+    if (init_B == cutlass::Distribution::Uniform) {
+
+      int scope_max = 8;
+      int scope_min = -8;
+
+      if (cutlass::sizeof_bits<ElementB>::value == 4) {
+        scope_max = 2;
+        scope_min = -2;
+      } else if (cutlass::sizeof_bits<ElementB>::value == 1) {
+        scope_max = 2;
+        scope_min = 0;
+      }
+
+      uint64_t seed = 7;
+      cutlass::reference::host::TensorFillRandomUniform(
+          matrix_B.host_view(), seed + 16, scope_max, scope_min, 0);
+    } else if (init_B == cutlass::Distribution::Sequential) {
+      cutlass::reference::host::BlockFillSequential(matrix_B.host_data(),
+                                                    matrix_B.capacity());
+    } else if (init_B == cutlass::Distribution::Identity) {
+      cutlass::reference::host::TensorFillIdentity(matrix_B.host_view());
+    } else {
+      // TODO: Implement the rest
+      return false;
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      cutlass::reference::host::TensorFill(matrix_C_computed[k].host_view());
+
+    cutlass::reference::host::TensorFill(matrix_C_reference.host_view());
+
+    matrix_A.sync_device();
+    matrix_B.sync_device();
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].sync_device();
+
+    typename IteratorA::Params params_A(matrix_A.layout());
+    typename IteratorB::Params params_B(matrix_B.layout());
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_pointers.at(cutlass::Coord<1>(k)) = matrix_C_computed[k].device_data();
+
+    matrix_C_pointers.sync_device();
+
+    test::gemm::threadblock::kernel_mma<Mma><<<grid, block>>>(
+        problem_size, params_A, matrix_A.device_ref(), params_B,
+        matrix_B.device_ref(), matrix_C_pointers.device_data(),
+        matrix_C_computed[0].layout().stride(0));
+
+    //
+    // Check error code
+    //
+
+    cudaError_t result = cudaDeviceSynchronize();
+    EXPECT_EQ(result, cudaSuccess)
+        << " kernel error: " << cudaGetErrorString(result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 0; k < kPartitionsK; k++)
+      matrix_C_computed[k].sync_host();
+
+    // TODO: this is temporary. it will be removed after slicing can de
+    // reduction
+    //
+    // Reduce matrix_C_computed
+    //
+    CUTLASS_PRAGMA_UNROLL
+    for(int k = 1; k < kPartitionsK; k++) {
+      CUTLASS_PRAGMA_UNROLL
+      for(int m = 0; m < matrix_C_computed[0].extent().row(); m++){
+        CUTLASS_PRAGMA_UNROLL
+        for(int n = 0; n < matrix_C_computed[0].extent().column(); n++){
+          matrix_C_computed[0].at({m, n}) += matrix_C_computed[k].at({m, n});
+        }
+      }
+    }
+
+    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
+                                   ElementC, LayoutC, ElementC, ElementC,
+                                   typename MmaCore::Operator>
+        reference_gemm;
+
+    reference_gemm(
+        problem_size, ElementC(alpha), matrix_A.host_view(),
+        matrix_B.host_view(), ElementC(beta), matrix_C_reference.host_view());
+
+    bool passed = cutlass::reference::host::TensorEquals(
+        matrix_C_computed[0].host_view(), matrix_C_reference.host_view());
+
+    EXPECT_TRUE(passed);
+
+    if (!passed) {
+      std::ofstream output("mma_pipelined_testbed_errors.txt");
+
+      output
+        << "A:\n" << matrix_A.host_view() << "\n"
+        << "B:\n" << matrix_B.host_view() << "\n"
+        << "Reference:\n"
+        << matrix_C_reference.host_view() << "\n"
+        << "Computed:\n"
+        << matrix_C_computed[0].host_view() << "\n";
+    }
+
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace threadblock
+}  // namespace gemm
+}  // namespace test
diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
index 06475a09..aec8b058 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
index d186ac74..9a8602b1 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
index 90271b4b..08754ebf 100644
--- a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
+++ b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
index baa3dc8f..59539062 100644
--- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
+++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
@@ -1,27 +1,31 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- *modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice,
- *this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *notice, this list of conditions and the following disclaimer in the
- *documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its
- *contributors may be used to endorse or promote products derived from this
- *software without specific prior written permission.
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT,
- *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 /*! \file
diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
index 82235a9b..2e516780 100644
--- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
index 485b5d62..7844b4d3 100644
--- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt
index 64f614d2..acef876d 100644
--- a/test/unit/gemm/warp/CMakeLists.txt
+++ b/test/unit/gemm/warp/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu
index 06f1c019..d0312345 100644
--- a/test/unit/gemm/warp/gemm_complex_sm80.cu
+++ b/test/unit/gemm/warp/gemm_complex_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
index 7ccfc0b7..359d8a09 100644
--- a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
+++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu
index e00fc851..3a245aaf 100644
--- a/test/unit/gemm/warp/gemm_sm50.cu
+++ b/test/unit/gemm/warp/gemm_sm50.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu
index 13ebf242..645d17fb 100644
--- a/test/unit/gemm/warp/gemm_sm60.cu
+++ b/test/unit/gemm/warp/gemm_sm60.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu
index 212d8f67..b61f2a76 100644
--- a/test/unit/gemm/warp/gemm_sm61.cu
+++ b/test/unit/gemm/warp/gemm_sm61.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu
index c288240f..60f1b863 100644
--- a/test/unit/gemm/warp/gemm_sm70.cu
+++ b/test/unit/gemm/warp/gemm_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu
index 6ccbfd6c..c7f5864f 100644
--- a/test/unit/gemm/warp/gemm_sm75.cu
+++ b/test/unit/gemm/warp/gemm_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu
index 956a1d70..9d693930 100644
--- a/test/unit/gemm/warp/gemm_sm80.cu
+++ b/test/unit/gemm/warp/gemm_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/gemm_sparse_sm80.cu b/test/unit/gemm/warp/gemm_sparse_sm80.cu
index 7578224a..8f9c9292 100644
--- a/test/unit/gemm/warp/gemm_sparse_sm80.cu
+++ b/test/unit/gemm/warp/gemm_sparse_sm80.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h
index ef0637a5..da53fb5c 100644
--- a/test/unit/gemm/warp/testbed.h
+++ b/test/unit/gemm/warp/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu
index ca01c0ef..a2a04b65 100644
--- a/test/unit/gemm/warp/wmma_sm70.cu
+++ b/test/unit/gemm/warp/wmma_sm70.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu
index 50ec272c..317d29d3 100644
--- a/test/unit/gemm/warp/wmma_sm72.cu
+++ b/test/unit/gemm/warp/wmma_sm72.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu
index 5152e59c..f1c5708f 100644
--- a/test/unit/gemm/warp/wmma_sm75.cu
+++ b/test/unit/gemm/warp/wmma_sm75.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt
index 3530f12f..bbe24abe 100644
--- a/test/unit/layout/CMakeLists.txt
+++ b/test/unit/layout/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu
index 2cf80c1b..4ee88089 100644
--- a/test/unit/layout/matrix.cu
+++ b/test/unit/layout/matrix.cu
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 /*! \file
diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu
index 43b7a0d5..9b3f6b83 100644
--- a/test/unit/layout/tensor.cu
+++ b/test/unit/layout/tensor.cu
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 /*! \file
diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu
index fae33df4..16b97b9d 100644
--- a/test/unit/layout/tensor_nhwc.cu
+++ b/test/unit/layout/tensor_nhwc.cu
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 /*! \file
diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt
index 4cacb7bc..10022ee5 100644
--- a/test/unit/nvrtc/CMakeLists.txt
+++ b/test/unit/nvrtc/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 set(CUTLASS_NVRTC_HAS_CUDA_FP16 FALSE)
diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h
index bbd3ce00..26bb2555 100644
--- a/test/unit/nvrtc/cutlass/nvrtc/environment.h
+++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h
index 57e67bfd..d0998229 100644
--- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h
+++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h
index 1d560c2c..2e7954fd 100644
--- a/test/unit/nvrtc/stdlib/stdint.h
+++ b/test/unit/nvrtc/stdlib/stdint.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt
index e2cbb2ea..f65ed986 100644
--- a/test/unit/nvrtc/thread/CMakeLists.txt
+++ b/test/unit/nvrtc/thread/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/nvrtc/thread/gemm_nvrtc.cu b/test/unit/nvrtc/thread/gemm_nvrtc.cu
index 6aad7335..acbc66df 100644
--- a/test/unit/nvrtc/thread/gemm_nvrtc.cu
+++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h
index f1e68685..795c506e 100644
--- a/test/unit/nvrtc/thread/testbed.h
+++ b/test/unit/nvrtc/thread/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt
index 3c4563fc..900c0087 100644
--- a/test/unit/reduction/CMakeLists.txt
+++ b/test/unit/reduction/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(thread)
diff --git a/test/unit/reduction/device/CMakeLists.txt b/test/unit/reduction/device/CMakeLists.txt
index 43ebae28..e112f11a 100644
--- a/test/unit/reduction/device/CMakeLists.txt
+++ b/test/unit/reduction/device/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/reduction/device/tensor_reduce_contiguous.cu b/test/unit/reduction/device/tensor_reduce_contiguous.cu
index cf633cbc..d5d6eb7b 100644
--- a/test/unit/reduction/device/tensor_reduce_contiguous.cu
+++ b/test/unit/reduction/device/tensor_reduce_contiguous.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/device/tensor_reduce_strided.cu b/test/unit/reduction/device/tensor_reduce_strided.cu
index eb288867..06b06b79 100644
--- a/test/unit/reduction/device/tensor_reduce_strided.cu
+++ b/test/unit/reduction/device/tensor_reduce_strided.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt
index 353e610f..c45d4f9b 100644
--- a/test/unit/reduction/kernel/CMakeLists.txt
+++ b/test/unit/reduction/kernel/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu
index 53fe258f..de33aa83 100644
--- a/test/unit/reduction/kernel/reduce_splitk.cu
+++ b/test/unit/reduction/kernel/reduce_splitk.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h
index 127856ee..e10b9ce8 100644
--- a/test/unit/reduction/kernel/reduce_splitk_testbed.h
+++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt
index bbfb2df6..909ad5f6 100644
--- a/test/unit/reduction/thread/CMakeLists.txt
+++ b/test/unit/reduction/thread/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu
index 196311c1..447b7ee7 100644
--- a/test/unit/reduction/thread/reduction_thread.cu
+++ b/test/unit/reduction/thread/reduction_thread.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h
index 07881691..33413355 100644
--- a/test/unit/reduction/thread/testbed.h
+++ b/test/unit/reduction/thread/testbed.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp
index 710acb4c..1d263eaf 100644
--- a/test/unit/test_unit.cpp
+++ b/test/unit/test_unit.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt
index 4b4be839..09a7078c 100644
--- a/test/unit/transform/CMakeLists.txt
+++ b/test/unit/transform/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(threadblock)
diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt
index 43c8847c..84ab92bb 100644
--- a/test/unit/transform/threadblock/CMakeLists.txt
+++ b/test/unit/transform/threadblock/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu
index 1ebf39b9..8157f2b3 100644
--- a/test/unit/transform/threadblock/predicated_tile_iterator.cu
+++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
index c0d075e8..092c1c2d 100644
--- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
+++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt
index ee41d08d..c93afc2c 100644
--- a/test/unit/util/CMakeLists.txt
+++ b/test/unit/util/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_test_unit_add_executable(
diff --git a/test/unit/util/cutlass_test_levels.cu b/test/unit/util/cutlass_test_levels.cu
index 9bf9faa3..ed6092a6 100644
--- a/test/unit/util/cutlass_test_levels.cu
+++ b/test/unit/util/cutlass_test_levels.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/test/unit/util/tensor_reduce.cu b/test/unit/util/tensor_reduce.cu
index 013d40ca..752d874e 100644
--- a/test/unit/util/tensor_reduce.cu
+++ b/test/unit/util/tensor_reduce.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 1ecd7d25..ee5cf222 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_subdirectory(util)
diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
index 5ef2b939..c6c7d6fb 100644
--- a/tools/library/CMakeLists.txt
+++ b/tools/library/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 include(GNUInstallDirs)
diff --git a/tools/library/include/cutlass/library/arch_mappings.h b/tools/library/include/cutlass/library/arch_mappings.h
index 060c584d..3638ca10 100644
--- a/tools/library/include/cutlass/library/arch_mappings.h
+++ b/tools/library/include/cutlass/library/arch_mappings.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h
index 8a8a976a..01314879 100644
--- a/tools/library/include/cutlass/library/handle.h
+++ b/tools/library/include/cutlass/library/handle.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h
index 37e2b89f..aa4a1364 100644
--- a/tools/library/include/cutlass/library/library.h
+++ b/tools/library/include/cutlass/library/library.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -51,6 +57,8 @@
 #include "cutlass/matrix_coord.h"
 #include "cutlass/tensor_coord.h"
 #include "cutlass/layout/tensor.h"
+#include "cutlass/blas3.h"
+
 #include "cutlass/gemm/gemm.h"
 #include "cutlass/conv/convolution.h"
 #include "cutlass/conv/conv2d_problem_size.h"
@@ -154,6 +162,10 @@ enum class Provider {
 /// Enumeration indicating the kind of operation
 enum class OperationKind {
   kGemm,
+  kRankK,
+  kRank2K,
+  kTrmm,
+  kSymm,
   kConv2d,              
   kConv3d,             
   kEqGemm,
@@ -210,12 +222,31 @@ enum class GemmKind {
   kUniversal,
   kPlanarComplex,
   kPlanarComplexArray,
+  kGrouped,
   kInvalid
 };
 
 /// Mode of Universal GEMM
 using GemmUniversalMode = cutlass::gemm::GemmUniversalMode;
 
+/// Enumeration indicating what kind of RankK update operation to perform
+enum class RankKKind {
+  kUniversal,
+  kInvalid
+};
+
+/// Enumeration indicating what kind of TRMM operation to perform
+enum class TrmmKind {
+  kUniversal,
+  kInvalid
+};
+
+/// Enumeration indicating what kind of SYMM/HEMM operation to perform
+enum class SymmKind {
+  kUniversal,
+  kInvalid
+};
+
 /// Enumeration indicating what kind of Conv2d operation to perform
 enum class ConvKind {
   kUnknown,
@@ -236,6 +267,8 @@ enum class IteratorAlgorithmID {
   kNone,
   kAnalytic,
   kOptimized,
+  kFixedChannels,
+  kFewChannels,
   kInvalid
 };
 
@@ -512,6 +545,204 @@ struct ReductionDescription : public OperationDescription {
   NumericTypeID element_epilogue;
 };
 
+/// Description of all Rank K update computations (SYRK, HERK, SYR2K, HER2K)
+struct RankKDescription : public OperationDescription {
+
+  /// Indicates which device template is used (universal or regular)
+  RankKKind rank_k_kind;
+
+  /// Number of rank update (rank k or rank 2k)
+  int num_ranks;
+  
+  /// Describes the A operand
+  TensorDescription A;
+
+  /// Describes the B operand (used only for SYR2K and HER2K)
+  TensorDescription B;
+
+  /// Describes the source and destination matrices
+  TensorDescription C;
+
+  /// Describes the fill mode for matrix C
+  FillMode fill_mode;
+
+  /// Describes the blas mode (symmetric/hermitian)
+  BlasMode blas_mode;
+
+  /// Describes the data type of the scalars passed to the epilogue
+  NumericTypeID element_epilogue;
+
+  /// Describes the structure of parallel reductions
+  SplitKMode split_k_mode;
+
+  /// Transformation on A operand
+  ComplexTransform transform_A;
+
+  /// Transformation on B operand
+  ComplexTransform transform_B;
+
+  //
+  // Methods
+  //
+
+  RankKDescription(
+    RankKKind rank_k_kind = RankKKind::kUniversal,
+    int num_ranks = 1,
+    TensorDescription const &A = TensorDescription(),
+    TensorDescription const &B = TensorDescription(),
+    TensorDescription const &C = TensorDescription(),
+    FillMode fill_mode = FillMode::kInvalid,
+    BlasMode blas_mode = BlasMode::kInvalid,
+    NumericTypeID element_epilogue = NumericTypeID::kInvalid,
+    SplitKMode split_k_mode = SplitKMode::kNone,
+    ComplexTransform transform_A = ComplexTransform::kNone,
+    ComplexTransform transform_B = ComplexTransform::kNone
+  ):
+    rank_k_kind(rank_k_kind),
+    num_ranks(num_ranks),
+    A(A),
+    B(B),
+    C(C),
+    fill_mode(fill_mode),
+    blas_mode(blas_mode),
+    element_epilogue(element_epilogue),
+    split_k_mode(split_k_mode),
+    transform_A(transform_A),
+    transform_B(transform_B) {} 
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Description of all TRMM computations
+struct TrmmDescription : public OperationDescription {
+
+  /// Indicates the kind of TRMM performed
+  TrmmKind trmm_kind;
+  
+  /// Describes the A operand
+  TensorDescription A;
+
+  /// Describes the side mode for matrix A
+  SideMode side_mode;
+
+  /// Describes the fill mode for matrix A
+  FillMode fill_mode;
+
+  /// Describes the diag type for matrix A
+  DiagType diag_type;
+
+  /// Describes the B operand
+  TensorDescription B;
+
+  /// Describes the source and destination matrices
+  TensorDescription D;
+
+  /// Describes the data type of the scalars passed to the epilogue
+  NumericTypeID element_epilogue;
+
+  /// Describes the structure of parallel reductions
+  SplitKMode split_k_mode;
+
+  /// Transformation on A operand
+  ComplexTransform transform_A;
+
+  //
+  // Methods
+  //
+
+  TrmmDescription(
+    TrmmKind trmm_kind = TrmmKind::kUniversal,
+    TensorDescription const &A = TensorDescription(),
+    SideMode side_mode = SideMode::kInvalid,
+    FillMode fill_mode = FillMode::kInvalid,
+    DiagType daig_type = DiagType::kInvalid,
+    TensorDescription const &B = TensorDescription(),
+    TensorDescription const &D = TensorDescription(),
+    NumericTypeID element_epilogue = NumericTypeID::kInvalid,
+    SplitKMode split_k_mode = SplitKMode::kNone,
+    ComplexTransform transform_A = ComplexTransform::kNone
+  ):
+    trmm_kind(trmm_kind),
+    A(A),
+    side_mode(side_mode),
+    fill_mode(fill_mode),
+    diag_type(diag_type),
+    B(B),
+    D(D),
+    element_epilogue(element_epilogue),
+    split_k_mode(split_k_mode),
+    transform_A(transform_A) {} 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Description of all SYMM/HEMM update computations
+struct SymmDescription : public OperationDescription {
+
+  /// Indicates which device template is used (universal or regular)
+  SymmKind symm_kind;
+  
+  /// Describes the A operand
+  TensorDescription A;
+
+  /// Describes the B operand 
+  TensorDescription B;
+
+  /// Describes the source and destination matrices
+  TensorDescription C;
+
+  /// Describes the side mode for matrix A
+  SideMode side_mode;
+
+  /// Describes the fill mode for matrix A
+  FillMode fill_mode;
+
+  /// Describes the blas mode (symmetric/hermitian)
+  BlasMode blas_mode;
+
+  /// Describes the data type of the scalars passed to the epilogue
+  NumericTypeID element_epilogue;
+
+  /// Describes the structure of parallel reductions
+  SplitKMode split_k_mode;
+
+  /// Transformation on A operand
+  ComplexTransform transform_A;
+
+  /// Transformation on B operand
+  ComplexTransform transform_B;
+
+  //
+  // Methods
+  //
+
+  SymmDescription(
+    SymmKind symm_kind = SymmKind::kUniversal,
+    TensorDescription const &A = TensorDescription(),
+    TensorDescription const &B = TensorDescription(),
+    TensorDescription const &C = TensorDescription(),
+    SideMode side_mode = SideMode::kInvalid,
+    FillMode fill_mode = FillMode::kInvalid,
+    BlasMode blas_mode = BlasMode::kInvalid,
+    NumericTypeID element_epilogue = NumericTypeID::kInvalid,
+    SplitKMode split_k_mode = SplitKMode::kNone,
+    ComplexTransform transform_A = ComplexTransform::kNone,
+    ComplexTransform transform_B = ComplexTransform::kNone
+  ):
+    symm_kind(symm_kind),
+    A(A),
+    B(B),
+    C(C),
+    side_mode(side_mode),
+    fill_mode(fill_mode),
+    blas_mode(blas_mode),
+    element_epilogue(element_epilogue),
+    split_k_mode(split_k_mode),
+    transform_A(transform_A),
+    transform_B(transform_B) {} 
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Description of all Conv2d operations
@@ -877,6 +1108,39 @@ struct GemmPlanarComplexArrayArguments {
   ScalarPointerMode pointer_mode;
 };
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Grouped GEMM supporting
+//
+// OperationKind: Gemm
+// GemmKind:      Grouped
+
+struct GemmGroupedConfiguration {
+
+  int problem_count;
+  int threadblock_count;
+
+};
+
+struct GemmGroupedArguments {
+
+  gemm::GemmCoord *problem_sizes;
+
+  void * ptr_A;
+  void * ptr_B;
+  void * ptr_C;
+  void * ptr_D;
+
+  int64_t *lda;
+  int64_t *ldb;
+  int64_t *ldc;
+  int64_t *ldd;
+
+  void const *alpha;
+  void const *beta;
+  ScalarPointerMode pointer_mode;
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // OperationKind: kSparseGemm
@@ -919,6 +1183,172 @@ struct SparseGemmArguments {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Configuration for basic Rank K update operations
+//
+// OperationKind: (Syrk, Herk, Syr2k, Her2k)
+// RankKKind:      Universal
+//
+struct RankKConfiguration {
+
+  /// SYRK problem size
+  gemm::GemmCoord problem_size;
+
+  /// Leading dimension of A matrix
+  int64_t lda;
+
+  /// Leading dimension of B matrix
+  int64_t ldb;
+
+  /// Leading dimension of C matrix
+  int64_t ldc;
+
+  /// Leading dimension of D matrix
+  int64_t ldd;
+
+  /// Batch Count
+  int batch_count;
+};
+
+/// Arguments for (Syrk, Herk, Syr2k, Her2k)
+struct RankKArguments {
+
+  /// Pointer to A matrix
+  void const *A;
+
+  /// Pointer to B matrix (used only for Syr2k and Her2k)
+  void const *B;
+
+  /// Pointer to C matrix
+  void const *C;
+
+  /// Pointer to D matrix
+  void *D;
+
+  /// Host or device pointer to alpha scalar
+  void const *alpha;
+
+  /// Host or device pointer to beta scalar
+  void const *beta;
+
+  /// Enumerant indicating whether alpha/beta point to host or device memory
+  ScalarPointerMode pointer_mode;
+
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_D;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Configuration for basic TRMM operations
+//
+// OperationKind: Trmm
+// TrmmKind:      Universal
+//
+struct TrmmConfiguration {
+
+  /// TRMM problem size
+  gemm::GemmCoord problem_size;
+
+  /// Leading dimension of A matrix
+  int64_t lda;
+
+  /// Leading dimension of B matrix
+  int64_t ldb;
+
+  /// Leading dimension of D matrix
+  int64_t ldd;
+
+  /// Batch Count
+  int batch_count;
+};
+
+/// Arguments for TRMM
+struct TrmmArguments {
+
+  /// Pointer to A matrix
+  void const *A;
+
+  /// Pointer to B matrix
+  void const *B;
+
+  /// Pointer to D matrix
+  void *D;
+
+  /// Host or device pointer to alpha scalar
+  void const *alpha;
+
+  /// Host or device pointer to beta scalar
+  void const *beta;
+
+  /// Enumerant indicating whether alpha/beta point to host or device memory
+  ScalarPointerMode pointer_mode;
+
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_D;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Configuration for basic SYMM/HEMM update operations
+//
+// OperationKind: (Symm, Hemm)
+// SymmKind:      Universal
+//
+struct SymmConfiguration {
+
+  /// SYMM/HEMM problem size
+  gemm::GemmCoord problem_size;
+
+  /// Leading dimension of A matrix
+  int64_t lda;
+
+  /// Leading dimension of B matrix
+  int64_t ldb;
+
+  /// Leading dimension of C matrix
+  int64_t ldc;
+
+  /// Leading dimension of D matrix
+  int64_t ldd;
+
+  /// Batch Count
+  int batch_count;
+};
+
+/// Arguments for (Symm, Hemm)
+struct SymmArguments {
+
+  /// Pointer to A matrix
+  void const *A;
+
+  /// Pointer to B matrix
+  void const *B;
+
+  /// Pointer to C matrix
+  void const *C;
+
+  /// Pointer to D matrix
+  void *D;
+
+  /// Host or device pointer to alpha scalar
+  void const *alpha;
+
+  /// Host or device pointer to beta scalar
+  void const *beta;
+
+  /// Enumerant indicating whether alpha/beta point to host or device memory
+  ScalarPointerMode pointer_mode;
+
+  int64_t batch_stride_A;
+  int64_t batch_stride_B;
+  int64_t batch_stride_C;
+  int64_t batch_stride_D;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Two dimensional convolution
diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h
index 78a49a13..d0943eeb 100644
--- a/tools/library/include/cutlass/library/manifest.h
+++ b/tools/library/include/cutlass/library/manifest.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h
index 3c5c3c75..9d542360 100644
--- a/tools/library/include/cutlass/library/operation_table.h
+++ b/tools/library/include/cutlass/library/operation_table.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/include/cutlass/library/singleton.h b/tools/library/include/cutlass/library/singleton.h
index 96786cf5..e58ed1db 100644
--- a/tools/library/include/cutlass/library/singleton.h
+++ b/tools/library/include/cutlass/library/singleton.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h
index 3b8e340d..b70e7e16 100644
--- a/tools/library/include/cutlass/library/util.h
+++ b/tools/library/include/cutlass/library/util.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -52,6 +58,27 @@ template <> Provider from_string<Provider>(std::string const &str);
 /// Converts a GemmKind enumerant to a string
 char const *to_string(GemmKind type, bool pretty = false);
 
+/// Converts a RankKKind enumerant to a string
+char const *to_string(RankKKind type, bool pretty = false);
+
+/// Converts a TrmmKind enumerant to a string
+char const *to_string(TrmmKind type, bool pretty = false);
+
+/// Converts a SymmKind enumerant to a string
+char const *to_string(SymmKind type, bool pretty = false);
+
+/// Converts a SideMode enumerant to a string
+char const *to_string(SideMode type, bool pretty = false);
+
+/// Converts a FillMode enumerant to a string
+char const *to_string(FillMode type, bool pretty = false);
+
+/// Converts a BlasMode enumerant to a string
+char const *to_string(BlasMode type, bool pretty = false);
+
+/// Converts a DiagType enumerant to a string
+char const *to_string(DiagType type, bool pretty = false);
+
 /// Converts a NumericType enumerant to a string
 char const *to_string(OperationKind type, bool pretty = false);
 
diff --git a/tools/library/scripts/gemm_operation.py b/tools/library/scripts/gemm_operation.py
index 2914d30a..095fbb91 100644
--- a/tools/library/scripts/gemm_operation.py
+++ b/tools/library/scripts/gemm_operation.py
@@ -9,10 +9,10 @@
 import shutil
 import functools
 import operator
+import collections
 
 from library import *
 
-
 ###################################################################################################
 #
 # Data structure modeling a GEMM operation
@@ -159,7 +159,9 @@ def configuration_name(self):
 class EmitGemmInstance:
   ''' Responsible for emitting a CUTLASS template definition'''
 
-  def __init__(self):
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
     self.gemm_template = """
   // Gemm operator ${operation_name}
   using Operation_${operation_name} = cutlass::gemm::device::Gemm<
@@ -214,6 +216,15 @@ def __init__(self):
   >;
 """
 
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
   def emit(self, operation):
 
     warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
@@ -264,7 +275,9 @@ def emit(self, operation):
 class EmitSparseGemmInstance:
   ''' Responsible for emitting a CUTLASS template definition'''
 
-  def __init__(self):
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
     self.gemm_template = """
   // Gemm operator ${operation_name}
   using Operation_${operation_name} = cutlass::gemm::device::SparseGemm<
@@ -293,6 +306,15 @@ def __init__(self):
   >;
 """
 
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
   def emit(self, operation):
 
     warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
@@ -345,7 +367,26 @@ def emit(self, operation):
 class EmitGemmUniversalInstance:
   ''' Responsible for emitting a CUTLASS template definition'''
 
-  def __init__(self):
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = [
+      "cutlass/cutlass.h",
+      "cutlass/numeric_types.h",
+      "cutlass/arch/arch.h",
+      "cutlass/arch/mma.h",
+      "cutlass/layout/matrix.h",
+      "cutlass/gemm/device/gemm.h",
+      "cutlass/gemm/device/gemm_universal_adapter.h",
+      "cutlass/gemm/kernel/default_gemm_universal.h",
+    ]
+    self.builtin_epilogue_functor_template = """
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >
+"""
     self.gemm_template = """
 // Gemm operator ${operation_name}
 using ${operation_name}_base = 
@@ -359,19 +400,14 @@ def __init__(self):
     cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
     cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
     cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
-    ${epilogue_functor}<
-      ${element_c},
-      ${epilogue_vector_length},
-      ${element_accumulator},
-      ${element_epilogue}
-    >,
+    ${epilogue_functor},
     ${swizzling_functor},
     ${stages},
     ${math_operation}
 >::GemmKernel;
 
 // Define named type
-struct ${operation_name} : 
+struct ${operation_name}${operation_suffix} : 
   public ${operation_name}_base { };
 """
     self.gemm_template_interleaved = """
@@ -387,22 +423,28 @@ def __init__(self):
     cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
     cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
     cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
-    ${epilogue_functor}<
-      ${element_c},
-      ${epilogue_vector_length},
-      ${element_accumulator},
-      ${element_epilogue}
-    >,
+    ${epilogue_functor},
     ${swizzling_functor},
     ${stages},
     ${math_operation}
 >::GemmKernel;
 
 // Define named type
-struct ${operation_name} : 
+struct ${operation_name}${operation_suffix} : 
   public ${operation_name}_base { };
 """
 
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+      cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+    >("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
   def emit(self, operation):
 
     threadblock_shape = operation.tile_description.threadblock_shape
@@ -410,8 +452,6 @@ def emit(self, operation):
 
     warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
 
-    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
-
     transpose_layouts = {
       LayoutType.ColumnMajor: LayoutType.RowMajor,
       LayoutType.RowMajor: LayoutType.ColumnMajor
@@ -433,8 +473,25 @@ def emit(self, operation):
       gemm_template = self.gemm_template_interleaved
     #
 
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+
+      epilogue_vector_length = \
+        min(operation.C.alignment * DataTypeSize[operation.C.element], 128) // DataTypeSize[operation.C.element]
+
+      values = {
+        'epilogue_vector_length': str(epilogue_vector_length),
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+    #
+
     values = {
       'operation_name': operation.procedural_name(),
+      'operation_suffix': self.operation_suffix,
       'element_a': DataTypeTag[operation.A.element],
       'layout_a': LayoutTag[instance_layout_A],
       'element_b': DataTypeTag[operation.B.element],
@@ -453,9 +510,7 @@ def emit(self, operation):
       'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
       'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
       'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
-      'epilogue_vector_length': str(epilogue_vector_length),
-      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
-      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'epilogue_functor': epilogue_functor,
       'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
       'stages': str(operation.tile_description.stages),
       'align_a': str(operation.A.alignment),
@@ -473,7 +528,9 @@ def emit(self, operation):
 class EmitGemmPlanarComplexInstance:
   ''' Responsible for emitting a CUTLASS template definition'''
 
-  def __init__(self):
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
     self.template = """
   // Gemm operator ${operation_name}
   using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal<
@@ -501,6 +558,17 @@ def __init__(self):
     public Operation_${operation_name} { };
 """
 
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
   def emit(self, operation):
 
     warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
@@ -547,7 +615,9 @@ def emit(self, operation):
 class EmitGemmPlanarComplexArrayInstance:
   ''' Responsible for emitting a CUTLASS template definition'''
 
-  def __init__(self):
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = []
     self.template = """
   // Gemm operator ${operation_name}
   using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal<
@@ -574,6 +644,17 @@ def __init__(self):
   struct ${operation_name} : public Operation_${operation_name} { };
 """
 
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
   def emit(self, operation):
 
     warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)]
@@ -616,6 +697,130 @@ def emit(self, operation):
 
 ###################################################################################################
 
+#
+class EmitGemmGroupedInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+
+  def __init__(self, operation_suffix = ''):
+    self.operation_suffix = operation_suffix
+    self.includes = [
+      "cutlass/cutlass.h",
+      "cutlass/numeric_types.h",
+      "cutlass/arch/arch.h",
+      "cutlass/arch/mma.h",
+      "cutlass/layout/matrix.h",
+      "cutlass/gemm/device/gemm.h",
+      "cutlass/gemm/kernel/gemm_grouped.h",
+      "cutlass/gemm/kernel/default_gemm_grouped.h",
+      "cutlass/gemm/device/gemm_grouped.h"
+    ]
+    self.builtin_epilogue_functor_template = """
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >
+"""
+    self.gemm_template = """
+// Gemm operator ${operation_name}
+using ${operation_name}_base =
+  typename cutlass::gemm::kernel::DefaultGemmGrouped<
+    ${element_a}, ${layout_a}, ${transform_a}, ${align_a},
+    ${element_b}, ${layout_b}, ${transform_b}, ${align_b},
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor},
+    ${swizzling_functor},
+    ${stages},
+    ${math_operation}
+>::GemmKernel;
+
+// Define named type
+struct ${operation_name}${operation_suffix} :
+  public ${operation_name}_base { };
+"""
+
+  #
+  def instance_template(self):
+    return """
+${compile_guard_start}
+  manifest.append(new ${gemm_kind}<
+    cutlass::gemm::device::GemmGrouped<${operation_name}>
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+
+  #
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    transpose_layouts = {
+      LayoutType.ColumnMajor: LayoutType.RowMajor,
+      LayoutType.RowMajor: LayoutType.ColumnMajor
+    }
+
+    instance_layout_A, instance_layout_B, instance_layout_C = \
+      (operation.A.layout, operation.B.layout, operation.C.layout)
+    #
+
+    # Support built-in epilogue functors or user-defined functions
+    if isinstance(operation.epilogue_functor, enum.Enum):
+
+      epilogue_vector_length = \
+        min(operation.C.alignment * DataTypeSize[operation.C.element], 128) // DataTypeSize[operation.C.element]
+
+      values = {
+        'epilogue_vector_length': str(epilogue_vector_length),
+        'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+        'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      }
+      epilogue_functor = SubstituteTemplate(self.builtin_epilogue_functor_template, values)
+    else:
+      epilogue_functor = self.epilogue_functor.emit_declaration()
+    #
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'operation_suffix': self.operation_suffix,
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[instance_layout_A],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[instance_layout_B],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[instance_layout_C],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_functor': epilogue_functor,
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation]
+    }
+
+    return SubstituteTemplate(self.gemm_template, values)
 
 ###################################################################################################
 #
@@ -633,7 +838,8 @@ def __init__(self, operation_path, configuration_name):
       GemmKind.Sparse: EmitSparseGemmInstance,
       GemmKind.Universal: EmitGemmUniversalInstance,
       GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance,
-      GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance
+      GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance,
+      GemmKind.Grouped: EmitGemmGroupedInstance
     }
 
     self.gemm_kind_wrappers = {
@@ -641,61 +847,21 @@ def __init__(self, operation_path, configuration_name):
       GemmKind.Sparse: 'GemmSparseOperation',
       GemmKind.Universal: 'GemmUniversalOperation',
       GemmKind.PlanarComplex: 'GemmPlanarComplexOperation',
-      GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation'
+      GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation',
+      GemmKind.Grouped: 'GemmGroupedOperation'
     }
 
     self.wmma_guard_start = "#if defined(CUTLASS_ARCH_WMMA_SM${sm_number}_ENABLED)"
 
-    self.instance_template = {
-      GemmKind.Gemm: """
-${compile_guard_start}
-  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
-${compile_guard_end}
-""",
-      GemmKind.Sparse: """
-${compile_guard_start}
-  manifest.append(new ${gemm_kind}<Operation_${operation_name}>("${operation_name}"));
-${compile_guard_end}
-""",
-      GemmKind.Universal: """
-${compile_guard_start}
-  manifest.append(new ${gemm_kind}<
-      cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
-    >("${operation_name}"));
-${compile_guard_end}
-""",
-      GemmKind.PlanarComplex: """
-${compile_guard_start}
-  manifest.append(new ${gemm_kind}<
-    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
-  >("${operation_name}"));
-${compile_guard_end}
-""",
-      GemmKind.PlanarComplexArray: """
-${compile_guard_start}
-  manifest.append(new ${gemm_kind}<
-    cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>
-  >("${operation_name}"));
-${compile_guard_end}
+    self.separator = """
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 """
-    }
 
     self.header_template = """
 /*
   Generated by gemm_operation.py - Do not edit.
 */
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-#include "cutlass/arch/wmma.h"
-#include "cutlass/cutlass.h"
-#include "cutlass/library/library.h"
-#include "cutlass/library/manifest.h"
-
-#include "library_internal.h"
-#include "gemm_operation.h"
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
 """
 
     self.initialize_function_template = """
@@ -726,7 +892,16 @@ def __init__(self, operation_path, configuration_name):
   def __enter__(self):
     self.configuration_file = open(self.configuration_path, "w")
     self.configuration_file.write(self.header_template)
-
+    self.configuration_file.write(self.separator)
+
+    self.includes = collections.OrderedDict([
+      ("cutlass/cutlass.h", None),
+      ("cutlass/library/library.h", None),
+      ("cutlass/library/manifest.h", None),
+      ("library_internal.h", None),
+      ("gemm_operation.h", None),
+      ("cutlass/arch/wmma.h", None),
+    ])
     self.instance_definitions = []
     self.instance_wrappers = []
 
@@ -736,11 +911,14 @@ def __enter__(self):
   def emit(self, operation):
     emitter = self.instance_emitter[operation.gemm_kind]()
 
+    for incl in emitter.includes:
+      self.includes[incl] = None
+
     self.operations.append(operation)
 
     self.instance_definitions.append(emitter.emit(operation))
 
-    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.gemm_kind], {
+    self.instance_wrappers.append(SubstituteTemplate(emitter.instance_template(), {
       'configuration_name': self.configuration_name,
       'operation_name': operation.procedural_name(),
       'gemm_kind': self.gemm_kind_wrappers[operation.gemm_kind],
@@ -752,6 +930,13 @@ def emit(self, operation):
 
   def __exit__(self, exception_type, exception_value, traceback):
 
+    # Write includes
+    for incl, _ in self.includes.items():
+      include_statement = "#include \"%s\"\n" % incl
+      self.configuration_file.write(include_statement)
+
+    self.configuration_file.write(self.separator)
+
     # Write instance definitions in top-level namespace
     for instance_definition in self.instance_definitions:
       self.configuration_file.write(instance_definition)
diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py
index bc5b599e..1f2301f1 100644
--- a/tools/library/scripts/generator.py
+++ b/tools/library/scripts/generator.py
@@ -32,6 +32,19 @@ def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
 ###################################################################################################
 ###################################################################################################
 
+#
+def EpilogueAlignment(max_alignment, tile, epilogue_steps = 8):
+  ''' Helper to compute the maximum alignment of the epilogue '''
+
+  def product(X, identity = 1):
+    result = identity
+    for item in X:
+      result *= item
+    return result
+
+  elements_per_thread = product(tile.threadblock_shape[:-1]) // product(tile.warp_count) // 32 // epilogue_steps
+  return min(max_alignment, elements_per_thread)
+
 #
 def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
   alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
@@ -45,7 +58,7 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
   operations = []
 
   # by default, only generate the largest tile and largest alignment
-  if manifest.args.kernels == '':
+  if manifest.kernel_filter == '':
     tile_descriptions = [tile_descriptions[0],]
     alignment_constraints = [alignment_constraints[0],]
 
@@ -83,7 +96,7 @@ def CreateSparseGemmOperator(manifest, layouts, tile_descriptions, data_type, \
   operations = []
 
   # by default, only generate the largest tile and largest alignment
-  if manifest.args.kernels == '':
+  if manifest.kernel_filter == '':
     tile_descriptions = [tile_descriptions[0],]
     alignment_constraints = [alignment_constraints[0],]
 
@@ -118,7 +131,7 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t
   gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray]
   
   # by default, only generate the largest tile and largest alignment
-  if manifest.args.kernels == '':
+  if manifest.kernel_filter == '':
     tile_descriptions = [tile_descriptions[0],]
     alignment_constraints = [alignment_constraints[0],]
   
@@ -139,6 +152,176 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t
               tile_description, A, B, C, element_epilogue))
   return
 
+#
+def CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, \
+  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
+  swizzling_functor = SwizzlingFunctor.Identity8):
+
+  if complex_transforms is None:
+    complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]
+
+  element_a, element_b, element_c, element_epilogue = data_type
+
+  operations = []
+
+  # by default, only generate the largest tile and largest alignment
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    alignment_constraints = [alignment_constraints[0],]
+
+  for layout in layouts:
+    for tile_description in tile_descriptions:
+      for alignment in alignment_constraints:
+        for complex_transform in complex_transforms:
+
+            alignment_c = min(8, alignment)
+
+            A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
+            B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
+            C = TensorDescription(element_c, layout[2], alignment_c)
+
+            new_operation = GemmOperation(GemmKind.Grouped, tile_description.minimum_compute_capability, \
+              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
+
+            manifest.append(new_operation)
+            operations.append(new_operation)
+
+  return operations
+
+#
+def CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, data_type, \
+  alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
+  swizzling_functor = SwizzlingFunctor.Identity8):
+
+  element_a, element_c, element_epilogue = data_type
+  
+  operations = []
+
+  # by default, only generate the largest tile and largest alignment
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    alignment_constraints = [alignment_constraints[0],]
+
+  for layout in layouts:
+    for fill_mode in fill_modes:
+      for tile_description in tile_descriptions:
+        for alignment in alignment_constraints:
+          
+          # SERK supported layouts (RowMajor, ColumnMajor) with no conjugation
+          complex_transform = ComplexTransform.none
+
+          # HERK supported layouts (RowMajor + conj, ColumnMajor)
+          if blas_mode == BlasMode.hermitian and layout[0] == LayoutType.RowMajor:
+            complex_transform = ComplexTransform.conj
+
+          alignment_c = 1 # Alignment only applies to A in SYRK
+   
+          A = TensorDescription(element_a, layout[0], alignment, complex_transform)
+          C = SymmetricTensorDescription(element_c, layout[1], fill_mode, alignment_c)
+
+          # Rank-K update
+          new_operation = RankKOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
+            tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
+
+          manifest.append(new_operation)
+          operations.append(new_operation)
+          
+          # Rank-2K update
+          new_operation = Rank2KOperation(RankKKind.Universal, tile_description.minimum_compute_capability, \
+            tile_description, A, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
+
+          manifest.append(new_operation)
+          operations.append(new_operation)
+
+  return operations
+
+#
+def CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, data_type, \
+  alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
+  swizzling_functor = SwizzlingFunctor.Identity8):
+
+  if complex_transforms is None:
+    complex_transforms = [(ComplexTransform.none),]
+
+  element_a, element_b, element_c, element_epilogue = data_type
+  
+  operations = []
+
+  # by default, only generate the largest tile and largest alignment
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    alignment_constraints = [alignment_constraints[0],]
+
+  for layout in layouts:
+    for side_mode in side_modes:
+      for fill_mode in fill_modes:
+        for diag_type in diag_types:
+          for tile_description in tile_descriptions:
+            for alignment in alignment_constraints:
+              for complex_transform in complex_transforms:
+        
+                  alignment_c = min(8, alignment)
+       
+                  A = TriangularTensorDescription(element_a, layout[0], side_mode, fill_mode, diag_type,
+                                                  alignment, complex_transform)
+                  B = TensorDescription(element_b, layout[1], alignment)
+                  C = TensorDescription(element_c, layout[2], alignment_c)
+
+                  new_operation = TrmmOperation(TrmmKind.Universal, tile_description.minimum_compute_capability, \
+                    tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)
+
+                  manifest.append(new_operation)
+                  operations.append(new_operation)
+
+  return operations
+
+#
+def CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, data_type, \
+  alignment_constraints, blas_mode, epilogue_functor = EpilogueFunctor.LinearCombination, \
+  swizzling_functor = SwizzlingFunctor.Identity8):
+
+  element_a, element_b, element_c, element_epilogue = data_type
+  
+  operations = []
+
+  # by default, only generate the largest tile and largest alignment
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    alignment_constraints = [alignment_constraints[0],]
+
+  for layout in layouts:
+    for side_mode in side_modes:
+      for fill_mode in fill_modes:
+        for tile_description in tile_descriptions:
+          for alignment in alignment_constraints:
+            
+            # SYMM supported layouts (RowMajor, ColumnMajor) with no conjugation
+            complex_transform = ComplexTransform.none
+
+            alignment_a = 1 # No vectorized access for the triangular matrix
+            alignment_c = min(8, alignment)
+       
+            A = SymmetricTensorDescription(element_a, layout[0], fill_mode, alignment_a, complex_transform, side_mode)
+            # tensor A and B have same data type and layout
+            B = TensorDescription(element_b, layout[0], alignment)
+            C = TensorDescription(element_c, layout[1], alignment_c)
+
+            # SYMM/HEMM update
+            new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
+              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
+
+            manifest.append(new_operation)
+            operations.append(new_operation)
+            
+            # SYMM/HEMM update
+            new_operation = SymmOperation(SymmKind.Universal, tile_description.minimum_compute_capability, \
+              tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, blas_mode)
+
+            manifest.append(new_operation)
+            operations.append(new_operation)
+
+  return operations
+
 ###########################################################################################################
 #   ConvolutionOperator support variations
 #        ____________________________________________________________________
@@ -164,7 +347,7 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
   iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
 
   # by default, only generate the largest tile size, largest alignment, and optimized iterator
-  if manifest.args.kernels == '':
+  if manifest.kernel_filter == '':
     tile_descriptions = [tile_descriptions[0],]
     alignment_constraints = [alignment_constraints[0],]
     iterator_algorithms = [IteratorAlgorithm.Optimized]
@@ -244,6 +427,94 @@ def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignme
 
   return operations
 
+# Convolution for 2D operations specialized for few channels
+def CreateConv2dFixedChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
+  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
+  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
+
+  element_a, element_b, element_c, element_epilogue = data_type
+
+  # one exceptional case
+
+  # iterator algorithm (analytic and optimized)
+  iterator_algorithms = [IteratorAlgorithm.FixedChannels,]
+
+  # by default, only generate the largest tile size, largest alignment, and optimized iterator
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    channel_counts = [channel_counts[0],]
+
+  operations = []
+
+
+
+  for tile in tile_descriptions:
+    for channel_count in channel_counts:
+
+      alignment_c = EpilogueAlignment(channel_count, tile)
+
+      A = TensorDescription(element_a, layout[0], channel_count)
+      B = TensorDescription(element_b, layout[1], channel_count)
+      C = TensorDescription(element_c, layout[2], alignment_c)
+
+      swizzling_functor_ = swizzling_functor
+
+      #
+      # Conv2d Fprop
+      #
+      if ConvKind.Fprop in conv_kinds:
+
+        # Strided support for Analytic and Optimized Fprop
+        for iterator_algorithm in iterator_algorithms:
+          new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
+            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
+
+          manifest.append(new_operation)
+          operations.append(new_operation)
+
+
+# Convolution for 2D operations specialized for few channels
+def CreateConv2dFewChannelsOperator(manifest, layout, tile_descriptions, data_type, channel_counts, \
+  conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], \
+  epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4):
+
+  element_a, element_b, element_c, element_epilogue = data_type
+
+  # one exceptional case
+
+  # iterator algorithm (analytic and optimized)
+  iterator_algorithms = [IteratorAlgorithm.FewChannels,]
+
+  # by default, only generate the largest tile size, largest alignment, and optimized iterator
+  if manifest.kernel_filter == '':
+    tile_descriptions = [tile_descriptions[0],]
+    channel_counts = [channel_counts[0],]
+
+  operations = []
+
+  for tile in tile_descriptions:
+    for channel_count in channel_counts:
+
+      alignment_c = EpilogueAlignment(channel_count, tile)
+
+      A = TensorDescription(element_a, layout[0], channel_count)
+      B = TensorDescription(element_b, layout[1], channel_count)
+      C = TensorDescription(element_c, layout[2], alignment_c)
+
+      swizzling_functor_ = swizzling_functor
+
+      #
+      # Conv2d Fprop
+      #
+      if ConvKind.Fprop in conv_kinds:
+
+        # Strided support for Analytic and Optimized Fprop
+        for iterator_algorithm in iterator_algorithms:
+          new_operation = Conv2dOperation(ConvKind.Fprop, iterator_algorithm, tile.minimum_compute_capability, tile,\
+            A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor, swizzling_functor_)
+
+          manifest.append(new_operation)
+          operations.append(new_operation)
 
 # Convolution for 3D operations
 def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignment, \
@@ -258,7 +529,7 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
   iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]
 
   # by default, only generate the largest tile size and optimized iterators
-  if manifest.args.kernels == '':
+  if manifest.kernel_filter == '':
     tile_descriptions = [tile_descriptions[0],]
     iterator_algorithms = [IteratorAlgorithm.Optimized]
 
@@ -325,7 +596,7 @@ def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignme
 ###################################################################################################
 
 #
-def GenerateSM50_Simt(manifest, args):
+def GenerateSM50_Simt(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -377,7 +648,7 @@ def GenerateSM50_Simt(manifest, args):
 #
 
 #
-def GenerateSM50_Simt_complex(manifest, args):
+def GenerateSM50_Simt_complex(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -424,15 +695,15 @@ def GenerateSM50_Simt_complex(manifest, args):
 #
 
 #
-def GenerateSM50(manifest, args):
-  GenerateSM50_Simt(manifest, args)
-  GenerateSM50_Simt_complex(manifest, args)
+def GenerateSM50(manifest, cuda_version):
+  GenerateSM50_Simt(manifest, cuda_version)
+  GenerateSM50_Simt_complex(manifest, cuda_version)
 
 ###################################################################################################
 ###################################################################################################
 
 #
-def GenerateSM60_Simt(manifest, args):
+def GenerateSM60_Simt(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -477,14 +748,14 @@ def GenerateSM60_Simt(manifest, args):
 #
 
 #
-def GenerateSM60(manifest, args):
-  GenerateSM60_Simt(manifest, args)
+def GenerateSM60(manifest, cuda_version):
+  GenerateSM60_Simt(manifest, cuda_version)
 
 ###################################################################################################
 ###################################################################################################
 
 #
-def GenerateSM61_Simt(manifest, args):
+def GenerateSM61_Simt(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -536,16 +807,16 @@ def GenerateSM61_Simt(manifest, args):
 #
 
 #
-def GenerateSM61(manifest, args):
-  GenerateSM61_Simt(manifest, args)
+def GenerateSM61(manifest, cuda_version):
+  GenerateSM61_Simt(manifest, cuda_version)
 
 ###################################################################################################
 ###################################################################################################
 
 #
-def GenerateSM70_TensorOp_884(manifest, args):
+def GenerateSM70_TensorOp_884(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
     return
 
   layouts = [
@@ -578,6 +849,8 @@ def GenerateSM70_TensorOp_884(manifest, args):
       TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -612,9 +885,9 @@ def GenerateSM70_TensorOp_884(manifest, args):
       CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
 
 #
-def GenerateSM70_PlanarComplexTensorOp_884(manifest, args):
+def GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 1):
     return
 
   layouts = [
@@ -679,7 +952,7 @@ def GenerateSM70_PlanarComplexTensorOp_884(manifest, args):
     
 
 #
-def GenerateSM70_WmmaTensorOp_161616(manifest, args):
+def GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version):
 
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
@@ -741,21 +1014,62 @@ def GenerateSM70_WmmaTensorOp_161616(manifest, args):
 ##################################################################################################
 #
 
-def GenerateSM70(manifest, args):
-  GenerateSM70_TensorOp_884(manifest, args)
-  GenerateSM70_PlanarComplexTensorOp_884(manifest, args)
+def GenerateSM70(manifest, cuda_version):
+  GenerateSM70_TensorOp_884(manifest, cuda_version)
+  GenerateSM70_PlanarComplexTensorOp_884(manifest, cuda_version)
 
   # To limit build size, WMMA GEMMs are disabled for now.
   #
-  #GenerateSM70_WmmaTensorOp_161616(manifest, args)
+  #GenerateSM70_WmmaTensorOp_161616(manifest, cuda_version)
 
 ###################################################################################################
 ###################################################################################################
 
 #
-def GenerateSM75_TensorOp_1688(manifest, args):
+def GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst):
+
+  min_cc = 75
+  max_cc = 1024
+
+  tile_descriptions = [
+    TileDescription([128,  64, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([256,  64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64,  64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([ 64, 128, 64], 2, [2, 2, 2], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [
+    math_inst.element_a,
+    math_inst.element_b,
+    math_inst.element_accumulator,
+    math_inst.element_accumulator,
+  ]
+
+  conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
+
+  CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
+  CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [1, 2, 4])
+
+  # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
+  if math_inst.element_a != math_inst.element_accumulator:
+
+    data_type_mixed = [
+      math_inst.element_a,
+      math_inst.element_b,
+      math_inst.element_a,
+      math_inst.element_accumulator,
+    ]
+
+    CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
+    CreateConv2dFewChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [1, 2, 4])
+
+#
+def GenerateSM75_TensorOp_1688(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -788,6 +1102,8 @@ def GenerateSM75_TensorOp_1688(manifest, args):
       TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 32], 2, [4, 1, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -805,6 +1121,7 @@ def GenerateSM75_TensorOp_1688(manifest, args):
       data_type, alignment_constraints)
 
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
+
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@@ -822,12 +1139,15 @@ def GenerateSM75_TensorOp_1688(manifest, args):
 
       CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
 
+    # Separate generator for 'few channels' specializations
+    GenerateSM75_TensorOp_1688_FewChannels(manifest, cuda_version, math_inst)
+
 #
 
 #
-def GenerateSM75_PlanarComplexTensorOp_1688(manifest, args):
+def GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -893,9 +1213,9 @@ def GenerateSM75_PlanarComplexTensorOp_1688(manifest, args):
         data_type_mixed, alignment_constraints, complex_transforms)
 
 #
-def GenerateSM75_TensorOp_8816_TN(manifest, args):
+def GenerateSM75_TensorOp_8816_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -940,11 +1260,11 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args):
     ]
     
     CreateGemmOperator(manifest, layouts, tile_descriptions, \
-      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
 
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
-      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
     if math_inst.element_a != math_inst.element_accumulator:
@@ -973,9 +1293,9 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args):
 #
 
 #
-def GenerateSM75_TensorOp_8816_Interleaved(manifest, args):
+def GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -1032,9 +1352,9 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args):
 #
 
 #
-def GenerateSM75_TensorOp_8832_TN(manifest, args):
+def GenerateSM75_TensorOp_8832_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -1063,6 +1383,8 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args):
       TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128,  64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64,  64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -1076,11 +1398,11 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args):
     ]
     
     CreateGemmOperator(manifest, layouts, tile_descriptions, \
-      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
       
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
-      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
     if math_inst.element_a != math_inst.element_accumulator:
@@ -1102,18 +1424,18 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args):
 
       for op in operations:
         if op.tile_description.threadblock_shape[1] >= 128:
-          op.C.alignment = 8
+          op.C.alignment = 16
         elif op.tile_description.threadblock_shape[1] == 64:
           op.C.alignment = 8
         else:
-          op.C.alignment = 4
+          op.C.alignment = 8
 
 #
 
 #
-def GenerateSM75_TensorOp_8832_Interleaved(manifest, args):
+def GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 2):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 2):
     return
 
   layouts = [
@@ -1170,9 +1492,9 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args):
 #
 
 #
-def GenerateSM75_TensorOp_88128(manifest, args):
+def GenerateSM75_TensorOp_88128(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1196,6 +1518,8 @@ def GenerateSM75_TensorOp_88128(manifest, args):
       TileDescription([256, 128, 512], 2, [4, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 256, 512], 2, [2, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 512], 2, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 512], 2, [4, 1, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 128, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128,  64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64,  64, 512], 2, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -1209,9 +1533,9 @@ def GenerateSM75_TensorOp_88128(manifest, args):
 #
 
 #
-def GenerateSM75_WmmaTensorOp_161616(manifest, args):
+def GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 10, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 10, 0):
     return
 
   layouts = [
@@ -1267,7 +1591,7 @@ def GenerateSM75_WmmaTensorOp_161616(manifest, args):
 #
 
 #
-def GenerateSM75_Simt_complex(manifest, args):
+def GenerateSM75_Simt_complex(manifest, cuda_version):
   math_instructions = [
     MathInstruction(                                  \
       [1, 1, 1],                                      \
@@ -1303,25 +1627,25 @@ def GenerateSM75_Simt_complex(manifest, args):
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
 #
 
-def GenerateSM75(manifest, args):
-  GenerateSM75_TensorOp_1688(manifest, args)
-  GenerateSM75_PlanarComplexTensorOp_1688(manifest, args)
-  GenerateSM75_TensorOp_8816_TN(manifest, args)
-  GenerateSM75_TensorOp_8816_Interleaved(manifest, args)
-  GenerateSM75_TensorOp_8832_TN(manifest, args)
-  GenerateSM75_TensorOp_8832_Interleaved(manifest, args)
-  GenerateSM75_TensorOp_88128(manifest, args)
-  #GenerateSM75_WmmaTensorOp_161616(manifest, args)
-  GenerateSM75_Simt_complex(manifest, args)
+def GenerateSM75(manifest, cuda_version):
+  GenerateSM75_TensorOp_1688(manifest, cuda_version)
+  GenerateSM75_PlanarComplexTensorOp_1688(manifest, cuda_version)
+  GenerateSM75_TensorOp_8816_TN(manifest, cuda_version)
+  GenerateSM75_TensorOp_8816_Interleaved(manifest, cuda_version)
+  GenerateSM75_TensorOp_8832_TN(manifest, cuda_version)
+  GenerateSM75_TensorOp_8832_Interleaved(manifest, cuda_version)
+  GenerateSM75_TensorOp_88128(manifest, cuda_version)
+  #GenerateSM75_WmmaTensorOp_161616(manifest, cuda_version)
+  GenerateSM75_Simt_complex(manifest, cuda_version)
 
 
 ###################################################################################################
 ###################################################################################################
 
 #
-def GenerateSM80_TensorOp_16816(manifest, args):
+def GenerateSM80_TensorOp_16816(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1359,6 +1683,7 @@ def GenerateSM80_TensorOp_16816(manifest, args):
     tile_descriptions = [
       TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 32],  3, [4, 1, 1], math_inst, min_cc, max_cc),
       TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
       TileDescription([128, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -1372,6 +1697,7 @@ def GenerateSM80_TensorOp_16816(manifest, args):
       TileDescription([256,  64, 64],  4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
       TileDescription([ 64, 256, 64],  4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited),
       TileDescription([128, 128, 64],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 64],  3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited),
       TileDescription([128,  64, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64, 128, 64],  3, [2, 2, 1], math_inst, min_cc, max_cc),
       TileDescription([ 64,  64, 64],  5, [2, 2, 1], math_inst, min_cc, max_cc),
@@ -1387,8 +1713,11 @@ def GenerateSM80_TensorOp_16816(manifest, args):
     CreateGemmOperator(manifest, layouts, tile_descriptions, \
       data_type, alignment_constraints)
 
+    CreateGemmGroupedOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints)
+
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
+    CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type, [4, 8])
     CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type, 8)
 
     # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@@ -1405,13 +1734,14 @@ def GenerateSM80_TensorOp_16816(manifest, args):
         data_type_mixed, alignment_constraints)
 
       CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
+      CreateConv2dFixedChannelsOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, [4, 8])
       CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type_mixed, 8)
 #
 
 #
-def GenerateSM80_SparseTensorOp_16832(manifest, args):
+def GenerateSM80_SparseTensorOp_16832(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
     return
 
   layouts = [
@@ -1488,9 +1818,9 @@ def GenerateSM80_SparseTensorOp_16832(manifest, args):
 #
 
 #
-def GenerateSM80_PlanarComplexTensorOp_16816(manifest, args):
+def GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1561,9 +1891,9 @@ def GenerateSM80_PlanarComplexTensorOp_16816(manifest, args):
         data_type_mixed, alignment_constraints, complex_transforms)
 
 #
-def GenerateSM80_TensorOp_16832_TN(manifest, args):
+def GenerateSM80_TensorOp_16832_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1613,7 +1943,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args):
     data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
   
     CreateGemmOperator(manifest, layouts, tile_descriptions, \
-      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
   
     operations = []
   
@@ -1622,7 +1952,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args):
       
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
-      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
     
     operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
       data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
@@ -1636,9 +1966,9 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args):
 #
 
 #
-def GenerateSM80_SparseTensorOp_16864_TN(manifest, args):
+def GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
     return
 
   layouts = [
@@ -1677,7 +2007,7 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, args):
   data_type_mixed = [DataType.s8, DataType.s8, DataType.s8, DataType.f32]
 
   CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
-    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
 
   operations = []
 
@@ -1692,9 +2022,9 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_16832_Interleaved(manifest, args):
+def GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1746,9 +2076,9 @@ def GenerateSM80_TensorOp_16832_Interleaved(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_16864_TN(manifest, args):
+def GenerateSM80_TensorOp_16864_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1799,7 +2129,7 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args):
     data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32]
   
     CreateGemmOperator(manifest, layouts, tile_descriptions, \
-      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
   
     operations = []
   
@@ -1808,24 +2138,24 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args):
        
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
-      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
+      data_type, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombination)
     
     operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions,
       data_type_mixed, alignment_constraints, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp)
   
     for op in operations:
       if op.tile_description.threadblock_shape[1] >= 128:
-        op.C.alignment = 8
+        op.C.alignment = 16
       elif op.tile_description.threadblock_shape[1] == 64:
         op.C.alignment = 8
       else:
-        op.C.alignment = 4
+        op.C.alignment = 8
 #
 
 #
-def GenerateSM80_SparseTensorOp_168128_TN(manifest, args):
+def GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
     return
 
   layouts = [
@@ -1864,7 +2194,7 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, args):
   data_type_mixed = [DataType.s4, DataType.s4, DataType.s4, DataType.f32]
 
   CreateSparseGemmOperator(manifest, layouts, tile_descriptions, \
-    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
+    data_type, alignment_constraints, None, EpilogueFunctor.LinearCombination)
 
   operations = []
 
@@ -1872,18 +2202,16 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, args):
     data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp)
 
   for op in operations:
-    if op.tile_description.threadblock_shape[1] >= 128:
-      op.C.alignment = 8 
-    elif op.tile_description.threadblock_shape[1] == 64:
-      op.C.alignment = 8
+    if op.tile_description.threadblock_shape[1] > 128:
+      op.C.alignment = 16
     else:
-      op.C.alignment = 4 
+      op.C.alignment = 8
 #
 
 #
-def GenerateSM80_TensorOp_16864_Interleaved(manifest, args):
+def GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1934,9 +2262,9 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_168256(manifest, args):
+def GenerateSM80_TensorOp_168256(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -1986,9 +2314,9 @@ def GenerateSM80_TensorOp_168256(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_1688(manifest, args):
+def GenerateSM80_TensorOp_1688(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2056,15 +2384,15 @@ def GenerateSM80_TensorOp_1688(manifest, args):
       data_type_mixed, alignment_constraints)
 
     conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC)
-    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
 
+    CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, alignment_constraints)
 #
 
 #
-def GenerateSM80_TensorOp_1688_fast_math(manifest, args):
+def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2131,9 +2459,9 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, args):
+def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2183,9 +2511,9 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, args):
     CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, alignment_constraints)
 #
 
-def GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, args):
+def GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2231,9 +2559,9 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, args):
 
 
 #
-def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, args):
+def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 1):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 1):
     return
 
   layouts = [
@@ -2281,9 +2609,9 @@ def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_1688_complex(manifest, args):
+def GenerateSM80_TensorOp_1688_complex(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2330,53 +2658,423 @@ def GenerateSM80_TensorOp_1688_complex(manifest, args):
 #
 
 #
-def GenerateSM80_TensorOp_884(manifest, args):
+def GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
-    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
-    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
-    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
-    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor),
   ]
 
-  math_inst =                                             \
-    MathInstruction(                                      \
-      [8, 8, 4],                                          \
-      DataType.f64, DataType.f64, DataType.f64,           \
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
       OpcodeClass.TensorOp,                               \
-      MathOperation.multiply_add)
+      MathOperation.multiply_add),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_fast_f32),
+  ]
 
   min_cc = 80
   max_cc = 1024
-  max_cc_smem_limited = 80
-
-  alignment_constraints = [1,]
 
-  tile_descriptions = [
-    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
-    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
-  ]
-
-  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
+  alignment_constraints = [1, 2, 4]  # Alignment only applies to A in SYRK
 
-  CreateGemmOperator(manifest, layouts, tile_descriptions, \
-    data_type, alignment_constraints)
-#
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([256, 128, 16],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 16],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      #TileDescription([256,  64, 16],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 256, 16],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 16],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([128,  64, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64,  64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      #TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64,  64, 32],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [DataType.f32, DataType.f32, DataType.f32]
+
+    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.symmetric)
+#
+
+#
+def GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor),
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_fast_f32),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [
+      DataType.cf32, DataType.cf32, DataType.cf32
+    ]
+
+    alignment_constraints = [1,]
+
+    # SYRK
+    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.symmetric)
+
+    # HERK
+    CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.hermitian)
+#
+
+#
+def GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  diag_types = [
+    DiagType.NonUnit, DiagType.Unit,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_fast_f32),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1, 2, 4] 
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([256, 128, 16],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 16],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([256,  64, 16],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64, 256, 16],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 16],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128,  64, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([ 64,  64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      #TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64,  64, 32],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
+
+    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
+      data_type, alignment_constraints)
+#
+
+#
+def GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  diag_types = [
+    DiagType.NonUnit, DiagType.Unit,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_fast_f32),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+      TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [
+      DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
+    ]
+
+    alignment_constraints = [1,]
+
+    complex_transforms = [
+      ComplexTransform.none, ComplexTransform.conj,
+    ]
+
+    CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
+      data_type, alignment_constraints, complex_transforms)
+#
+
+#
+def GenerateSM80_TensorOp_1688_symm(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  # A and B have same layouts
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_fast_f32),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [
+    1, 2, 4
+  ] 
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([256, 128, 16],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 16],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      #TileDescription([256,  64, 16],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 256, 16],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 16],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([128,  64, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 16],  6, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64,  64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([256, 128, 32],  3, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 256, 32],  3, [2, 4, 1], math_inst, min_cc, max_cc),
+      #TileDescription([256,  64, 32],  4, [4, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 256, 32],  4, [1, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([128, 128, 32],  4, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([128,  64, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64, 128, 32],  3, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([ 64,  64, 32],  5, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32]
+
+    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.symmetric)
+#
+
+#
+def GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_instructions = [ 
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.tf32, DataType.tf32, DataType.f32,         \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex),
+      MathInstruction(                                    \
+      [16, 8, 8],                                         \
+      DataType.f32, DataType.f32, DataType.f32,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_fast_f32),
+  ]
+
+  min_cc = 80
+  max_cc = 1024
+
+  for math_inst in math_instructions:
+    tile_descriptions = [
+      TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc),
+      TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+      #TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+      #TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    ]
+
+    data_type = [
+      DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32
+    ]
+
+    alignment_constraints = [1,]
+
+    # SYMM
+    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.symmetric)
+
+    # HEMM
+    CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+      data_type, alignment_constraints, BlasMode.hermitian)
+#
+
+#
+def GenerateSM80_TensorOp_884(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add)
+
+  min_cc = 80
+  max_cc = 1024
+  max_cc_smem_limited = 80
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([256, 64, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([256, 32, 16], 3, [4, 1, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 256, 16], 3, [1, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
+
+  CreateGemmOperator(manifest, layouts, tile_descriptions, \
+    data_type, alignment_constraints)
+#
 
 #
-def GenerateSM80_TensorOp_884_complex(manifest, args):
+def GenerateSM80_TensorOp_884_complex(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2430,9 +3128,9 @@ def GenerateSM80_TensorOp_884_complex(manifest, args):
     data_type, alignment_constraints, complex_transforms)
 
 #
-def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args):
+def GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version):
 
-  if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0):
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
     return
 
   layouts = [
@@ -2476,10 +3174,459 @@ def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args):
     data_type, alignment_constraints, complex_transforms)
 #
 
+#
+def GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor),
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper, 
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.f64, DataType.f64, DataType.f64]
+
+  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+#
+
+#
+def GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor),
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
+
+  # SYRK computation
+  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+
+  # HERK computation
+  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.hermitian)
+
+#
+
+#
+def GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor),
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_gaussian)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64]
+
+  complex_transforms = [ComplexTransform.none,]
+
+  # SYRK computation
+  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+
+  # HERK computation
+  CreateRankKOperator(manifest, layouts, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.hermitian)
+#
+
+#
+def GenerateSM80_TensorOp_884_trmm(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  diag_types = [
+    DiagType.NonUnit, DiagType.Unit,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
+
+  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
+    data_type, alignment_constraints)
+#
+
+#
+def GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  diag_types = [
+    DiagType.NonUnit, DiagType.Unit,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
+
+  complex_transforms = [
+    ComplexTransform.none, ComplexTransform.conj,
+  ]
+
+  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
+    data_type, alignment_constraints, complex_transforms)
+#
+
+
+#
+def GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+    (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  diag_types = [
+    DiagType.NonUnit, DiagType.Unit,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_gaussian)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
+
+  complex_transforms = [
+    ComplexTransform.none, ComplexTransform.conj,
+  ]
+
+  CreateTrmmOperator(manifest, layouts, side_modes, fill_modes, diag_types, tile_descriptions, \
+    data_type, alignment_constraints, complex_transforms)
+#
+
+#
+def GenerateSM80_TensorOp_884_symm(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper, 
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64]
+
+  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+#
+
+#
+def GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
+
+  # SYMM computation
+  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+
+  # HEMM computation
+  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.hermitian)
+#
+
+#
+def GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version):
+
+  if not CudaToolkitVersionSatisfies(cuda_version, 11, 0):
+    return
+
+  layouts = [
+    (LayoutType.ColumnMajor, LayoutType.ColumnMajor),
+  ]
+
+  side_modes = [
+    SideMode.Left, SideMode.Right,
+  ]
+
+  fill_modes = [
+    FillMode.Lower, FillMode.Upper,
+  ]
+
+  math_inst =                                             \
+    MathInstruction(                                      \
+      [8, 8, 4],                                          \
+      DataType.f64, DataType.f64, DataType.f64,           \
+      OpcodeClass.TensorOp,                               \
+      MathOperation.multiply_add_complex_gaussian)
+
+  min_cc = 80
+  max_cc = 1024
+
+  alignment_constraints = [1,]
+
+  tile_descriptions = [
+    TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc),
+    #TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc),
+  ]
+
+  data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64]
+
+  complex_transforms = [ComplexTransform.none,]
+
+  # SYMM computation
+  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.symmetric)
+
+  # HEMM computation
+  CreateSymmOperator(manifest, layouts, side_modes, fill_modes, tile_descriptions, \
+    data_type, alignment_constraints, BlasMode.hermitian)
+#
+
 ###################################################################################################
 
 #
-def GenerateSM80_Simt_f32(manifest, args):
+def GenerateSM80_Simt_f32(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -2531,7 +3678,7 @@ def GenerateSM80_Simt_f32(manifest, args):
 
 
 #
-def GenerateSM80_Simt_f64(manifest, args):
+def GenerateSM80_Simt_f64(manifest, cuda_version):
   layouts = [
     (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor),
     (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor),
@@ -2576,7 +3723,7 @@ def GenerateSM80_Simt_f64(manifest, args):
 
 ##################################################################################################
 #
-def GenerateSM80_Simt_complex(manifest, args):
+def GenerateSM80_Simt_complex(manifest, cuda_version):
   math_instructions = [
     MathInstruction(                                  \
       [1, 1, 1],                                      \
@@ -2633,30 +3780,45 @@ def GenerateSM80_Simt_complex(manifest, args):
 ###################################################################################################
 
 #
-def GenerateSM80(manifest, args):
-  GenerateSM80_TensorOp_16816(manifest, args)
-  GenerateSM80_SparseTensorOp_16832(manifest, args)
-  GenerateSM80_PlanarComplexTensorOp_16816(manifest, args)
-  GenerateSM80_TensorOp_1688(manifest, args)
-  GenerateSM80_TensorOp_1688_fast_math(manifest, args)
-  GenerateSM80_SparseTensorOp_16816_fast_math(manifest, args)
-  GenerateSM80_TensorOp_1688_complex(manifest, args)
+def GenerateSM80(manifest, cuda_version):
+  GenerateSM80_TensorOp_16816(manifest, cuda_version)
+  GenerateSM80_SparseTensorOp_16832(manifest, cuda_version)
+  GenerateSM80_PlanarComplexTensorOp_16816(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version)
+  GenerateSM80_SparseTensorOp_16816_fast_math(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_complex(manifest, cuda_version)
   # 3xTF32 
-  GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, args)
-  GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, args)
-  GenerateSM80_TensorOp_884(manifest, args)
-  GenerateSM80_TensorOp_884_complex(manifest, args)
-  GenerateSM80_TensorOp_884_complex_gaussian(manifest, args)
-  GenerateSM80_TensorOp_16832_TN(manifest, args)
-  GenerateSM80_SparseTensorOp_16864_TN(manifest, args)
-  GenerateSM80_TensorOp_16832_Interleaved(manifest, args)
-  GenerateSM80_TensorOp_16864_TN(manifest, args)
-  GenerateSM80_SparseTensorOp_168128_TN(manifest, args)
-  GenerateSM80_TensorOp_16864_Interleaved(manifest, args)
-  GenerateSM80_TensorOp_168256(manifest, args)
-  GenerateSM80_Simt_f32(manifest, args)
-  GenerateSM80_Simt_f64(manifest, args)
-  GenerateSM80_Simt_complex(manifest, args)
+  GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_fast_fp32_math_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_rank_k(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_rank_k_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_trmm(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_trmm_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_symm(manifest, cuda_version)
+  GenerateSM80_TensorOp_1688_symm_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_884(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_complex_gaussian(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_rank_k(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_rank_k_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_rank_k_complex_gaussian(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_trmm(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_trmm_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_trmm_complex_gaussian(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_symm(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_symm_complex(manifest, cuda_version)
+  GenerateSM80_TensorOp_884_symm_complex_gaussian(manifest, cuda_version)
+  GenerateSM80_TensorOp_16832_TN(manifest, cuda_version)
+  GenerateSM80_SparseTensorOp_16864_TN(manifest, cuda_version)
+  GenerateSM80_TensorOp_16832_Interleaved(manifest, cuda_version)
+  GenerateSM80_TensorOp_16864_TN(manifest, cuda_version)
+  GenerateSM80_SparseTensorOp_168128_TN(manifest, cuda_version)
+  GenerateSM80_TensorOp_16864_Interleaved(manifest, cuda_version)
+  GenerateSM80_TensorOp_168256(manifest, cuda_version)
+  GenerateSM80_Simt_f32(manifest, cuda_version)
+  GenerateSM80_Simt_f64(manifest, cuda_version)
+  GenerateSM80_Simt_complex(manifest, cuda_version)
 
 ###################################################################################################
 ###################################################################################################
@@ -2671,6 +3833,7 @@ def GenerateSM80(manifest, args):
   parser.add_argument("--architectures", default='53;60;61;70;75;80', help="Target compute architectures")
   parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.')
   parser.add_argument("--ignore-kernels", default='', help='Comma delimited list of kernels to exclude from build.')
+  parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose comupte capability range is not satisfied by the build target are excluded.')
   parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit")
   parser.add_argument('--kernel-filter-file',   type=str, default=None, required=False, help='Full path of filter file')
   parser.add_argument('--selected-kernel-list',   type=str, default=None, required=False,
@@ -2681,12 +3844,12 @@ def GenerateSM80(manifest, args):
 
   manifest = Manifest(args)
 
-  GenerateSM50(manifest, args)
-  GenerateSM60(manifest, args)
-  GenerateSM61(manifest, args)
-  GenerateSM70(manifest, args)
-  GenerateSM75(manifest, args)
-  GenerateSM80(manifest, args)
+  GenerateSM50(manifest, args.cuda_version)
+  GenerateSM60(manifest, args.cuda_version)
+  GenerateSM61(manifest, args.cuda_version)
+  GenerateSM70(manifest, args.cuda_version)
+  GenerateSM75(manifest, args.cuda_version)
+  GenerateSM80(manifest, args.cuda_version)
   if 'library' in args.generator_target.split(','):
     manifest.emit(GeneratorTarget.Library)
 
diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py
index 4504f55a..a6b78324 100644
--- a/tools/library/scripts/library.py
+++ b/tools/library/scripts/library.py
@@ -187,6 +187,17 @@ class DataType(enum.Enum):
 }
 
 ###################################################################################################
+#
+class BlasMode(enum.Enum):
+  symmetric = enum_auto()
+  hermitian = enum_auto()
+
+#
+BlasModeTag = {
+  BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
+  BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
+}
+
 #
 class ComplexTransform(enum.Enum):
   none = enum_auto()
@@ -341,6 +352,64 @@ class LayoutType(enum.Enum):
 }
 
 ###################################################################################################
+
+#
+class SideMode(enum.Enum):
+  Left = enum_auto()
+  Right = enum_auto()
+
+#
+SideModeTag = {
+  SideMode.Left: 'cutlass::SideMode::kLeft',
+  SideMode.Right: 'cutlass::SideMode::kRight'
+}
+
+#
+ShortSideModeNames = {
+  SideMode.Left: 'ls',
+  SideMode.Right: 'rs'
+}
+
+###################################################################################################
+
+#
+class FillMode(enum.Enum):
+  Lower = enum_auto()
+  Upper = enum_auto()
+
+#
+FillModeTag = {
+  FillMode.Lower: 'cutlass::FillMode::kLower',
+  FillMode.Upper: 'cutlass::FillMode::kUpper'
+}
+
+#
+ShortFillModeNames = {
+  FillMode.Lower: 'l',
+  FillMode.Upper: 'u'
+}
+
+###################################################################################################
+
+#
+class DiagType(enum.Enum):
+  NonUnit = enum_auto()
+  Unit = enum_auto()
+
+#
+DiagTypeTag = {
+  DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
+  DiagType.Unit: 'cutlass::DiagType::kUnit'
+}
+
+#
+ShortDiagTypeNames = {
+  DiagType.NonUnit: 'nu',
+  DiagType.Unit: 'un'
+}
+
+###################################################################################################
+
 #
 class OpcodeClass(enum.Enum):
   Simt = enum_auto()
@@ -366,12 +435,20 @@ class OpcodeClass(enum.Enum):
 #
 class OperationKind(enum.Enum):
   Gemm = enum_auto()
+  RankK = enum_auto()
+  Rank2K = enum_auto()
+  Trmm = enum_auto()
+  Symm = enum_auto()
   Conv2d = enum_auto()        
   Conv3d = enum_auto()        
 
 #
 OperationKindNames = {
   OperationKind.Gemm: 'gemm'
+  , OperationKind.RankK: 'rank_k'
+  , OperationKind.Rank2K: 'rank_2k'
+  , OperationKind.Trmm: 'trmm'
+  , OperationKind.Symm: 'symm'
   , OperationKind.Conv2d: 'conv2d'  
   , OperationKind.Conv3d: 'conv3d' 
 }
@@ -414,6 +491,7 @@ class GemmKind(enum.Enum):
   Universal = enum_auto()
   PlanarComplex = enum_auto()
   PlanarComplexArray = enum_auto()
+  Grouped = enum_auto()
 
 #
 GemmKindNames = {
@@ -422,6 +500,34 @@ class GemmKind(enum.Enum):
   GemmKind.Universal: "gemm",
   GemmKind.PlanarComplex: "gemm_planar_complex",
   GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
+  GemmKind.Grouped: "gemm_grouped"
+}
+
+#
+class RankKKind(enum.Enum):
+  Universal = enum_auto()
+
+#
+RankKKindNames = {
+  RankKKind.Universal: "rank_k"
+}
+
+#
+class TrmmKind(enum.Enum):
+  Universal = enum_auto()
+
+#
+TrmmKindNames = {
+  TrmmKind.Universal: "trmm"
+}
+
+#
+class SymmKind(enum.Enum):
+  Universal = enum_auto()
+
+#
+SymmKindNames = {
+  SymmKind.Universal: "symm"
 }
 
 #
@@ -483,16 +589,22 @@ class ConvKind(enum.Enum):
 class IteratorAlgorithm(enum.Enum):
   Analytic = enum_auto()
   Optimized = enum_auto()
+  FixedChannels = enum_auto()
+  FewChannels = enum_auto()
 
 #
 IteratorAlgorithmTag = {
   IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
   IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
+  IteratorAlgorithm.FixedChannels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
+  IteratorAlgorithm.FewChannels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
 }
 
 IteratorAlgorithmNames = {
   IteratorAlgorithm.Analytic: 'analytic',
   IteratorAlgorithm.Optimized: 'optimized',
+  IteratorAlgorithm.FixedChannels: 'fixed_channels',
+  IteratorAlgorithm.FewChannels: 'few_channels'
 }
 
 #
@@ -547,4 +659,25 @@ def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTr
     self.alignment = alignment
     self.complex_transform = complex_transform
 
+#
+class SymmetricTensorDescription:
+  def __init__(self, element, layout, fill_mode, alignment = 1, complex_transform = ComplexTransform.none, side_mode = SideMode.Left):
+    self.element = element
+    self.layout = layout
+    self.fill_mode = fill_mode
+    self.alignment = alignment
+    self.complex_transform = complex_transform
+    self.side_mode = side_mode
+
+#
+class TriangularTensorDescription:
+  def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment = 1, complex_transform = ComplexTransform.none):
+    self.element = element
+    self.layout = layout
+    self.side_mode = side_mode
+    self.fill_mode = fill_mode
+    self.diag_type = diag_type
+    self.alignment = alignment
+    self.complex_transform = complex_transform
+
 ###################################################################################################
diff --git a/tools/library/scripts/manifest.py b/tools/library/scripts/manifest.py
index f11ce06d..408ab1f5 100644
--- a/tools/library/scripts/manifest.py
+++ b/tools/library/scripts/manifest.py
@@ -10,6 +10,10 @@
 
 from library import *
 from gemm_operation import *
+from rank_k_operation import *
+from rank_2k_operation import *
+from trmm_operation import *
+from symm_operation import *
 from conv2d_operation import *  
 from conv3d_operation import *  
 
@@ -24,6 +28,10 @@ def __init__(self, generated_path, kind, args):
       OperationKind.Gemm: EmitGemmConfigurationLibrary
       , OperationKind.Conv2d: EmitConv2dConfigurationLibrary  
       , OperationKind.Conv3d: EmitConv3dConfigurationLibrary  
+      , OperationKind.RankK: EmitRankKConfigurationLibrary
+      , OperationKind.Rank2K: EmitRank2KConfigurationLibrary
+      , OperationKind.Trmm: EmitTrmmConfigurationLibrary
+      , OperationKind.Symm: EmitSymmConfigurationLibrary
     }
 
     self.configurations = [];
@@ -177,44 +185,57 @@ def __init__(self):
 class Manifest:
 
   #
-  def __init__(self, args):
+  def __init__(self, args = None):
     self.operations = {}
     self.args = args
-
-    architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
-    self.compute_capabilities = [int(x) for x in architectures]
-    
-    self.selected_kernels = []
-    
-    if args.operations == 'all':
-      self.operations_enabled = []
-    else:
-
-      operations_list = [
-        OperationKind.Gemm
-        , OperationKind.Conv2d    
-        , OperationKind.Conv3d    
-      ] 
-
-      self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
-
-    if args.kernels == 'all':
-      self.kernel_names = []
-    else:
-      self.kernel_names = [x for x in args.kernels.split(',') if x != '']
-
-    self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
-
-    if args.kernel_filter_file is None:
-        self.kernel_filter_list = []
-    else:
-        self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
-
-
     self.operation_count = 0
     self.operations_by_name = {}
+    
+    self.kernel_filter = ''
+    self.kernel_filter_list = []
+    self.kernel_names = []
+    self.operations_enabled = []
+    self.selected_kernels = []
+    self.ignore_kernel_names = []
+    self.compute_capabilities = [50,]
+    self.curr_build_dir = '.'
+    self.filter_by_cc = True
+
+    if self.args:
+      self.kernel_filter = self.args.kernels
+      self.curr_build_dir = args.curr_build_dir
+      architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
+      self.compute_capabilities = [int(x) for x in architectures]
+      
+      if args.filter_by_cc in ['false', 'False', '0']:
+        self.filter_by_cc = False
+
+      if args.operations == 'all':
+        self.operations_enabled = []
+      else:
+        operations_list = [
+          OperationKind.Gemm
+          , OperationKind.Conv2d    
+          , OperationKind.Conv3d    
+          , OperationKind.RankK
+          , OperationKind.Trmm
+          , OperationKind.Symm
+        ] 
+        self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
+
+      if args.kernels == 'all':
+        self.kernel_names = []
+      else:
+        self.kernel_names = [x for x in args.kernels.split(',') if x != '']
+
+      self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
+
+      if args.kernel_filter_file is None:
+          self.kernel_filter_list = []
+      else:
+          self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
 
-
+  #
   def get_kernel_filters (self, kernelListFile):
     if os.path.isfile(kernelListFile):
         with open(kernelListFile, 'r') as fileReader:
@@ -225,8 +246,7 @@ def get_kernel_filters (self, kernelListFile):
     else:
         return []
 
-
-
+  #
   def filter_out_kernels(self, kernel_name, kernel_filter_list):
 
     for kernel_filter_re in kernel_filter_list:
@@ -252,7 +272,8 @@ def filter(self, operation):
     ''' Filtering operations based on various criteria'''
 
     # filter based on compute capability
-    enabled = False
+    enabled = not (self.filter_by_cc)
+
     for cc in self.compute_capabilities:
       if cc >= operation.tile_description.minimum_compute_capability and \
         cc <= operation.tile_description.maximum_compute_capability:
@@ -292,7 +313,6 @@ def filter(self, operation):
         if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
             enabled = True
 
-
     # todo: filter based on compute data type
     return enabled
   #
@@ -334,7 +354,7 @@ def emit(self, target = GeneratorTarget.Library):
       GeneratorTarget.Library: EmitInterfaceLibrary 
     }
 
-    generated_path = os.path.join(self.args.curr_build_dir, 'generated')
+    generated_path = os.path.join(self.curr_build_dir, 'generated')
 
     # create generated/
     if os.path.exists(generated_path):
diff --git a/tools/library/scripts/rank_2k_operation.py b/tools/library/scripts/rank_2k_operation.py
new file mode 100644
index 00000000..ebb7c3eb
--- /dev/null
+++ b/tools/library/scripts/rank_2k_operation.py
@@ -0,0 +1,398 @@
+#
+# \file generator.py
+#
+# \brief Generates the CUTLASS Library's instances
+#
+# 
+
+import enum
+import os.path
+import shutil
+import functools
+import operator
+
+from library import *
+
+
+###################################################################################################
+#
+# Data structure modeling a Rank K update operation
+#
+###################################################################################################
+
+#
+class Rank2KOperation:
+  #
+  def __init__(self, rank_k_kind, arch, tile_description, A, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.Rank2K
+    self.arch = arch
+    self.tile_description = tile_description
+    self.rank_k_kind = rank_k_kind
+    # tensor A and B have same data type and layout
+    self.A = A
+    self.B = A  
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex, 
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+
+  #
+  def is_planar_complex(self):
+    return False
+
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+
+    if self.is_complex():
+      return get_complex_from_real(accum)
+
+    return accum
+
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+
+
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+    }
+
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+
+    operation_name = 'syr2k' if self.blas_mode == BlasMode.symmetric else 'her2k'
+
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+
+    return extended_name
+
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)] 
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.C.fill_mode])
+
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+
+    alignment = max([self.A.alignment, self.C.alignment])
+
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'fill_mode': self.fill_mode_name(),
+        'alignment': "%d" % self.A.alignment,
+      }
+    )
+
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+
+#
+class EmitRank2KUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+
+  def __init__(self):
+    self.rank_k_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Rank2K<
+    ${element_a}, ${layout_a}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.rank_k_complex_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Rank2K<
+    ${element_a}, ${layout_a}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a},
+    ${transform_b},
+    ${blas_mode}
+>;
+"""
+
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'fill_mode': FillModeTag[operation.C.fill_mode],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false', 
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'transform_b': ComplexTransformTag[operation.B.complex_transform],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    }
+
+    rank_k_template = self.rank_k_complex_template if operation.is_complex() else self.rank_k_template
+
+    return SubstituteTemplate(rank_k_template, values)
+
+###################################################################################################
+
+
+###################################################################################################
+#
+# Emitters functions for all targets
+#
+###################################################################################################
+
+class EmitRank2KConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+
+    self.instance_emitter = {
+      RankKKind.Universal: EmitRank2KUniversalInstance,
+    }
+
+    self.rank_k_kind_wrappers = {
+      RankKKind.Universal: 'Rank2KOperation',
+    }
+
+    self.instance_template = {
+      RankKKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${rank_k_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+
+    self.header_template = """
+/*
+  Generated by rank_2k_operation.py - Do not edit.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+
+#include "library_internal.h"
+#include "rank_2k_operation.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+    self.initialize_function_template = """
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+void initialize_${configuration_name}(Manifest &manifest) {
+
+"""
+    self.epilogue_template = """
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+
+    self.instance_definitions = []
+    self.instance_wrappers = []
+
+    self.operations = []
+    return self
+
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.rank_k_kind]()
+
+    self.operations.append(operation)
+
+    self.instance_definitions.append(emitter.emit(operation))
+
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.rank_k_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'rank_k_kind': self.rank_k_kind_wrappers[operation.rank_k_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "" 
+      }))
+
+  def __exit__(self, exception_type, exception_value, traceback):
+
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+   
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper) 
+
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+
+###################################################################################################
diff --git a/tools/library/scripts/rank_k_operation.py b/tools/library/scripts/rank_k_operation.py
new file mode 100644
index 00000000..5ff596be
--- /dev/null
+++ b/tools/library/scripts/rank_k_operation.py
@@ -0,0 +1,387 @@
+#
+# \file generator.py
+#
+# \brief Generates the CUTLASS Library's instances
+#
+# 
+
+import enum
+import os.path
+import shutil
+import functools
+import operator
+
+from library import *
+
+
+###################################################################################################
+#
+# Data structure modeling a Rank K update operation
+#
+###################################################################################################
+
+#
+class RankKOperation:
+  #
+  def __init__(self, rank_k_kind, arch, tile_description, A, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.RankK
+    self.arch = arch
+    self.tile_description = tile_description
+    self.rank_k_kind = rank_k_kind
+    self.A = A
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex, 
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+
+  #
+  def is_planar_complex(self):
+    return False
+
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+
+    if self.is_complex():
+      return get_complex_from_real(accum)
+
+    return accum
+
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+
+
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+    }
+
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+
+    operation_name = 'syrk' if self.blas_mode == BlasMode.symmetric else 'herk'
+
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+
+    return extended_name
+
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)] 
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.C.fill_mode])
+
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+
+    alignment = max([self.A.alignment, self.C.alignment])
+
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'fill_mode': self.fill_mode_name(),
+        'alignment': "%d" % self.A.alignment,
+      }
+    )
+
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+
+#
+class EmitRankKUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+
+  def __init__(self):
+    self.rank_k_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::RankK<
+    ${element_a}, ${layout_a}, 
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.rank_k_complex_template = """
+// Rank K operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::RankK<
+    ${element_a}, ${layout_a}, 
+    ${element_c}, ${layout_c}, ${fill_mode},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a},
+    ${blas_mode}
+>;
+"""
+
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'fill_mode': FillModeTag[operation.C.fill_mode],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'split_k_serial': 'false', 
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    }
+
+    rank_k_template = self.rank_k_complex_template if operation.is_complex() else self.rank_k_template
+
+    return SubstituteTemplate(rank_k_template, values)
+
+###################################################################################################
+
+
+###################################################################################################
+#
+# Emitters functions for all targets
+#
+###################################################################################################
+
+class EmitRankKConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+
+    self.instance_emitter = {
+      RankKKind.Universal: EmitRankKUniversalInstance,
+    }
+
+    self.rank_k_kind_wrappers = {
+      RankKKind.Universal: 'RankKOperation',
+    }
+
+    self.instance_template = {
+      RankKKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${rank_k_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+
+    self.header_template = """
+/*
+  Generated by rank_k_operation.py - Do not edit.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+
+#include "library_internal.h"
+#include "rank_k_operation.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+    self.initialize_function_template = """
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+void initialize_${configuration_name}(Manifest &manifest) {
+
+"""
+    self.epilogue_template = """
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+
+    self.instance_definitions = []
+    self.instance_wrappers = []
+
+    self.operations = []
+    return self
+
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.rank_k_kind]()
+
+    self.operations.append(operation)
+
+    self.instance_definitions.append(emitter.emit(operation))
+
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.rank_k_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'rank_k_kind': self.rank_k_kind_wrappers[operation.rank_k_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "" 
+      }))
+
+  def __exit__(self, exception_type, exception_value, traceback):
+
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+   
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper) 
+
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+
+###################################################################################################
diff --git a/tools/library/scripts/rt.py b/tools/library/scripts/rt.py
new file mode 100644
index 00000000..f84b0257
--- /dev/null
+++ b/tools/library/scripts/rt.py
@@ -0,0 +1,796 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+# System imports
+import struct
+import io
+import ctypes
+
+# CUDA Python import
+from cuda import cuda
+from cuda import nvrtc
+
+# CUTLASS imports
+from library import *
+from gemm_operation import EmitGemmUniversalInstance
+
+#################################################################################################
+#
+# CUTLASS Py Runtime Components
+#
+#################################################################################################
+
+#
+def MaxAlignment(fmt):
+  align = 1
+  for x in fmt:
+    align = max(align, struct.calcsize(x))
+  return align
+  
+#
+def AlignedOffset(offset, align):
+  remainder = (offset % align)
+  if remainder:
+    offset += (align - remainder)
+  return offset
+
+#
+def PackInteger(host_workspace, offset, value):
+  fmt = "i"
+  padding = AlignedOffset(offset, 4)  
+  struct.pack_into(fmt, host_workspace, offset, value)
+  return padding + struct.calcsize(fmt)
+
+#
+def PackDevicePointer(host_workspace, offset, value):
+  fmt = "P"
+  offset = AlignedOffset(offset, 8)
+  struct.pack_into(fmt, host_workspace, offset, value)
+  return offset + struct.calcsize(fmt)
+  
+#
+def ceil_div(a, b):
+  return -(a // -b)
+
+#################################################################################################
+
+#
+class PitchLinearCoord:
+  def __init__(self, contiguous, strided):
+    self.contiguous = contiguous
+    self.strided = strided
+
+#
+class GemmCoord:
+  def __init__(self, m = 1, n = 1, k = 1):
+    self.m = m
+    self.n = n
+    self.k = k
+    self.fmt = "iii"
+
+  #
+  def ceil_div(self, rhs):
+    return GemmCoord(ceil_div(self.m, rhs.m), ceil_div(self.n, rhs.n), ceil_div(self.k, rhs.k))
+
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def pack_into(self, host_workspace, offset):
+    
+    offset = AlignedOffset(offset, 4)
+  
+    struct.pack_into(
+      self.fmt, 
+      host_workspace, 
+      offset, 
+      self.m, self.n, self.k)
+
+    return offset + self.size()
+
+#
+class TensorRef:
+  def __init__(self, pointer = None, layout = 0):
+    self.pointer = pointer
+    self.layout = layout
+
+  def __str__(self):
+    return "(%x, %d)" % (self.pointer._ptr, self.layout)
+
+#################################################################################################
+
+#
+class PredicatedTileAccessIteratorDesc:
+  '''
+  '''
+  
+  def __init__(
+      self, 
+      element_size_bits, 
+      advance_rank, 
+      threadblock_shape, 
+      threadmap_iterations, 
+      threadmap_delta):
+
+    self.element_size_bits = element_size_bits
+    self.advance_rank = advance_rank
+    self.threadblock_shape = threadblock_shape
+    self.threadmap_iterations = threadmap_iterations
+    self.threadmap_delta = threadmap_delta
+
+#
+class PredicatedTileAccessIteratorParams:
+  '''
+  '''
+  #
+  def __init__(self, desc, label):
+    self.desc = desc
+    self.label = label
+    self.fmt = "qqqq"
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, stride):
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    inc_strided = stride *                            \
+                  self.desc.threadmap_delta.strided * \
+                  self.desc.element_size_bits // 8
+
+    if self.desc.advance_rank:
+      inc_advance = self.desc.threadblock_shape.strided * \
+                          stride *                        \
+                          self.desc.element_size_bits // 8
+    else:
+      inc_advance = self.desc.threadblock_shape.contiguous * \
+                          self.desc.element_size_bits // 8
+
+    inc_next = inc_advance - (self.desc.threadmap_iterations.strided - 1) * \
+                      self.desc.threadmap_delta.strided *                   \
+                      stride *                                              \
+                      self.desc.element_size_bits // 8
+
+    struct.pack_into(
+      self.fmt, 
+      host_workspace, 
+      offset, 
+      stride, inc_strided, inc_next, inc_advance)
+
+    return offset + self.size()
+  #
+
+#################################################################################################
+
+#
+class EpilogueTileDesc:
+  '''
+  '''
+  def __init__(self, column, row, group, cluster, tile):
+    self.column = column
+    self.row = row
+    self.group = group
+    self.cluster = cluster
+    self.tile = tile
+
+#
+class EpilogueThreadMap:
+  '''
+  '''
+  def __init__(self, threads, elements_per_access, element_size_bits, shape, iterations, delta, count):
+    self.threads = threads
+    self.elements_per_access = elements_per_access
+    self.element_size_bits = element_size_bits
+    self.shape = shape
+    self.iterations = iterations
+    self.delta = delta
+    self.count = count
+    pass
+
+#
+class EpilogueTileIteratorParams:
+  '''
+  '''
+  #
+  def __init__(self, desc, label):
+    self.desc = desc
+    self.label = label
+    self.fmt = "qqqqqqqq"
+
+  #
+  def size(self):
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  #
+  def initialize(self, host_workspace, offset, stride):
+
+    stride = stride * self.desc.element_size_bits // 8
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    increment_row = stride * self.desc.delta.row
+
+    increment_group = stride * self.desc.delta.group \
+      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
+
+    increment_cluster = stride * self.desc.delta.cluster \
+      - stride * self.desc.delta.group * (self.desc.iterations.group - 1) \
+      - stride * self.desc.delta.row * (self.desc.iterations.row - 1)
+      
+    advance_row = stride * self.desc.shape.row
+
+    advance_group = stride *                   \
+      (self.desc.shape.group - 1) * \
+      self.desc.shape.row *         \
+      self.desc.count.row
+
+    advance_cluster = stride * \
+      self.desc.count.group * \
+      self.desc.shape.group * \
+      self.desc.count.row   * \
+      self.desc.shape.row
+
+    advance_tile = stride * \
+      self.desc.shape.group * \
+      self.desc.shape.row   * \
+      self.desc.shape.cluster * \
+      self.desc.shape.tile
+
+    struct.pack_into(
+      self.fmt,                                           \
+      host_workspace,                                     \
+      offset,                                             \
+      stride,                                             \
+      increment_row, increment_group, increment_cluster,  \
+      advance_row, advance_group, advance_cluster, advance_tile)
+
+    return offset + self.size()
+#
+
+#################################################################################################
+#
+# Launch configuration
+#
+#################################################################################################
+
+class LaunchConfiguration:
+  def __init__(self, grid = [1,1,1], block = [1,1,1], smem = 0):
+    self.grid = grid
+    self.block = block
+    self.shared_memory_capacity = smem
+
+#################################################################################################
+#
+# Functors
+#
+#################################################################################################
+
+#
+class Functor:
+  def __init__(self):
+    self.decl = ''
+    self.definition = ''
+    self.fmt = ''
+    self.identifier = ''
+
+  #
+  def emit_declaration(self):
+    return self.decl
+
+  #
+  def emit_definition(self):
+    return self.definition
+
+  # 
+  def size(self):
+    '''
+    Size of the packed Params structure
+    '''
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  # 
+  def initialize(self, host_workspace, offset, arguments):
+    return offset + self.size()
+
+#################################################################################################
+
+#
+class LinearCombinationFunctorArguments:
+  def __init__(self, alpha = 1.0, beta = 0.0):
+    self.alpha = alpha
+    self.beta = beta
+    self.alpha_ptr = 0
+    self.beta_ptr = 0
+
+#
+class LinearCombinationFunctor(Functor):
+  def __init__(self):
+    super().__init__()
+
+    self.decl = """
+    cutlass::epilogue::thread::LinearCombination<
+      float,
+      1,
+      float,
+      float
+    >"""
+    self.identifier = 'linear_combination'
+    self.fmt = "ffPP"
+
+  # 
+  def size(self):
+    '''
+    Size of the packed Params structure
+    '''
+    return struct.calcsize(self.fmt)
+
+  #
+  def alignment(self):
+    return MaxAlignment(self.fmt)
+
+  # 
+  def initialize(self, host_workspace, offset, arguments):
+
+    offset = AlignedOffset(offset, self.alignment())
+
+    struct.pack_into(
+      self.fmt, 
+      host_workspace, offset, 
+      arguments.alpha, arguments.beta, arguments.alpha_ptr, arguments.beta_ptr)
+
+    return offset + self.size()
+
+#################################################################################################
+#
+# Base class for an executable operation
+#
+#################################################################################################
+
+#
+class ExecutableOperation:
+  '''
+  '''
+  def __init__(self, operation):
+    self.operation = operation
+    self.module = None
+    self.kernel = None
+
+  #
+  def name(self):
+    return self.operation.procedural_name()
+
+  #
+  def emit(self):
+    return ''
+
+  #
+  def can_implement(self, configuration, arguments):
+    return False
+
+  #
+  def get_host_workspace_size(self, arguments):
+    return 0
+
+  #
+  def get_device_workspace_size(self, arguments):
+    return 0
+
+  #
+  def plan(self, arguments):
+    return LaunchConfiguration()
+
+  #
+  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
+    raise NotImplementedError()
+
+  #
+  def run(self, host_workspace, device_workspace, launch_config, stream = cuda.CUstream(0)):
+
+    cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
+    packed = (ctypes.c_void_p * 1)()
+    packed[0] = ctypes.addressof(cArg)
+
+    err, = cuda.cuLaunchKernel(
+      self.kernel, 
+      launch_config.grid[0], launch_config.grid[1], launch_config.grid[2], 
+      launch_config.block[0], launch_config.block[1], launch_config.block[2], 
+      launch_config.shared_memory_capacity, 
+      stream, 
+      packed, 
+      0)
+
+    return err
+
+#################################################################################################
+
+
+#
+class GemmArguments:
+  '''
+  '''
+  def __init__(self):
+    self.problem_size = GemmCoord(0, 0, 0)
+    self.A = TensorRef()
+    self.B = TensorRef()
+    self.C = TensorRef()
+    self.D = TensorRef()
+    self.output_op = LinearCombinationFunctorArguments()
+
+#
+class ThreadblockSwizzle:
+  def __init__(self, threadblock_shape, log_threadblock_cohort = 0):
+    self.threadblock_shape = threadblock_shape
+    self.log_threadblock_cohort = log_threadblock_cohort
+
+  def grid_tiled_shape(self, problem_size):
+    return GemmCoord(
+      ceil_div(problem_size.m, self.threadblock_shape.m), 
+      ceil_div(problem_size.n, self.threadblock_shape.n), 
+      1)
+
+#
+class Gemm(ExecutableOperation):
+  '''
+  GEMM manages the CUTLASS runtime components
+  '''
+  #
+  def __init__(self, operation):
+    super().__init__(operation)
+
+    self.emitter = EmitGemmUniversalInstance('_type')
+    self.threadblock_swizzle = ThreadblockSwizzle(GemmCoord(128, 128, 8))
+
+    self.threads = 256
+    self.shared_memory_capacity = (32 << 10)
+
+    self.params_A = PredicatedTileAccessIteratorParams(
+        PredicatedTileAccessIteratorDesc(
+          32, 
+          1, 
+          PitchLinearCoord(128, 8), 
+          PitchLinearCoord(1, 4), 
+          PitchLinearCoord(1, 2)), 'A')
+
+    self.params_B = PredicatedTileAccessIteratorParams(
+        PredicatedTileAccessIteratorDesc(
+          32, 
+          1, 
+          PitchLinearCoord(128, 8), 
+          PitchLinearCoord(1, 4), 
+          PitchLinearCoord(1, 2)), 'B')
+
+    self.params_C = EpilogueTileIteratorParams(
+      EpilogueThreadMap(
+        256, 
+        1, 
+        32,
+        EpilogueTileDesc(128, 1, 4, 4, 1), 
+        EpilogueTileDesc(4, 1, 2, 1, 1), 
+        EpilogueTileDesc(32, 1, 8, 1, 1), 
+        EpilogueTileDesc(1, 4, 2, 1, 8)), 'C')
+
+    self.params_D = EpilogueTileIteratorParams(
+      EpilogueThreadMap(
+        256, 
+        1,
+        32,
+        EpilogueTileDesc(128, 1, 4, 4, 1), 
+        EpilogueTileDesc(4, 1, 2, 1, 1), 
+        EpilogueTileDesc(32, 1, 8, 1, 1), 
+        EpilogueTileDesc(1, 4, 2, 1, 8)), 'D')
+
+    self.output_op = LinearCombinationFunctor()
+
+  #
+  def emit(self):
+    return self.emitter.emit(self.operation)
+
+  #
+  def can_implement(self, configuration, arguments):
+    pass
+
+  #
+  def get_host_workspace_size(self, arguments):
+    return 336
+
+  #
+  def get_device_workspace_size(self, arguments):
+    return 0
+
+  #
+  def plan(self, arguments):
+    grid = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
+    return LaunchConfiguration([grid.m, grid.n, grid.k], [self.threads, 1, 1], self.shared_memory_capacity)
+
+  #
+  def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream = cuda.CUstream(0)):
+    
+    offset = 0
+
+    # Compute intermediate results
+    swizzle_log_tile = 0
+    gemm_mode = 0
+    batch_count = 1
+    gemm_k_size = arguments.problem_size.k
+
+    # Pack into the host workspace buffer
+    offset = arguments.problem_size.pack_into(host_workspace, offset)
+
+    grid_tiled_shape = self.threadblock_swizzle.grid_tiled_shape(arguments.problem_size)
+    offset = grid_tiled_shape.pack_into(host_workspace, offset)
+
+    offset = PackInteger(host_workspace, offset, swizzle_log_tile)
+
+    offset = self.params_A.initialize(host_workspace, offset, arguments.A.layout)
+    offset = self.params_B.initialize(host_workspace, offset, arguments.B.layout)
+    offset = self.params_C.initialize(host_workspace, offset, arguments.C.layout)
+    offset = self.params_D.initialize(host_workspace, offset, arguments.D.layout)
+
+    offset = self.output_op.initialize(host_workspace, offset, arguments.output_op)
+
+    offset = PackInteger(host_workspace, offset, gemm_mode)
+    offset = PackInteger(host_workspace, offset, batch_count)
+    offset = PackInteger(host_workspace, offset, gemm_k_size)
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.A.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.B.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.C.pointer))
+    offset = PackDevicePointer(host_workspace, offset, int(arguments.D.pointer))   
+
+    return offset
+
+
+#################################################################################################
+#
+# Module represents a compilation unit 
+#
+#################################################################################################
+
+#
+class CompilationOptions:
+  '''
+  Compilation options.
+  '''
+
+  #
+  def __init__(self, architectures = [80], include_paths = []):
+    self.includes = []
+    self.include_paths = include_paths
+    self.flags = ['-std=c++11', '-default-device']
+    self.architectures = architectures
+
+  #
+  def get(self):
+    options = []
+
+    for flag in self.flags:
+      options.append(bytes(str.encode(flag)))
+
+    for incl in self.include_paths:
+      options.append(bytes(str.encode('--include-path=%s' % incl)))
+
+    arch_list = "-arch="
+    for idx, arch in enumerate(self.architectures):
+      if idx:
+        arch_list += ","
+      arch_list += "sm_%d" % arch
+
+    options.append(bytes(str.encode(arch_list)))
+
+    return options
+
+IncludeTemplate = r'''#include "${include}"
+'''
+
+KernelTemplate = r'''
+extern "C"
+__global__ void
+${operation_name}(${operation_name}${operation_suffix}::Params params) {
+
+  // Dynamic shared memory base pointer
+  extern __shared__ int SharedStorageBase[];
+
+  // Declare pointer to dynamic shared memory.
+  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
+      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
+
+  ${operation_name}${operation_suffix} op;
+
+  op(params, *shared_storage);
+}
+
+'''
+
+#
+class Module:
+  def __init__(self, name, operations, compilation_options):
+    self.name = name
+    self.operations = operations
+    self.module = None
+    self.log = None
+    self.cubin_image = None
+    self.source_buffer = ''
+
+    #
+    # Emit source
+    #
+    self.emit_()
+
+    #
+    # Compile
+    #
+    self.compile_(compilation_options)
+
+    #
+    # Load module
+    #
+    self.load_()
+    
+    # Done
+    return
+
+  # Emit a source buffer
+  def emit_(self):
+
+    # 1. Includes
+    includes = []
+    for operation in self.operations:
+      for incl in operation.emitter.includes:
+        if incl not in includes:
+          includes.append(incl)
+
+    for incl in includes:
+      self.source_buffer += SubstituteTemplate(IncludeTemplate, { 'include': incl} )
+
+    # 2. Operations
+    for operation in self.operations:
+      self.source_buffer += operation.emit()
+      values = {
+        'operation_name': operation.name(),
+        'operation_suffix': operation.emitter.operation_suffix
+      }
+      self.source_buffer += SubstituteTemplate(KernelTemplate, values)
+
+    # Done
+    return
+
+  # Compile with NVRTC
+  def compile_(self, compilation_options):
+
+    err, program = nvrtc.nvrtcCreateProgram(
+      str.encode(self.source_buffer), 
+      bytes(str.encode(self.name)), 
+      0, [], [])
+
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError('NVRTC Error: {}'.format(err))
+
+    # Compile program
+    options = compilation_options.get()
+
+    err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+
+      error_string = 'NVRTC Error: {}\n'.format(err)
+
+      # Get log from compilation
+      err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
+      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+          raise RuntimeError('NVRTC Error: {}'.format(err))
+      
+      self.log = b' ' * logSize
+      err, = nvrtc.nvrtcGetProgramLog(program, self.log)
+      if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+          raise RuntimeError('NVRTC Error: {}'.format(err))
+      
+      raise RuntimeError(error_string + self.log.decode() + self.source_buffer)
+
+    # Get data from compilation
+    err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError('NVRTC Error: {}'.format(err))
+    
+    self.cubin_image = b' ' * dataSize
+    err, = nvrtc.nvrtcGetCUBIN(program, self.cubin_image)
+    if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError('NVRTC Error: {}'.format(err))
+
+    return
+    
+  #
+  def load_(self):
+
+    # Load data as module data
+    err, self.module = cuda.cuModuleLoadData(self.cubin_image)
+    if err != cuda.CUresult.CUDA_SUCCESS:
+        raise RuntimeError('Cuda Error: {}'.format(err))
+    
+    # Get functions
+    for operation in self.operations:
+      err, operation.kernel = cuda.cuModuleGetFunction(
+        self.module, 
+        bytes(str.encode(operation.name())))
+
+      if err != cuda.CUresult.CUDA_SUCCESS:
+          raise RuntimeError('Cuda Error: {}'.format(err))
+
+      operation.module = self
+
+    return
+
+
+#################################################################################################
+#
+# Manifest represents an 'owner' for modules and operations
+#
+#################################################################################################
+
+#
+class Manifest:
+
+  #
+  def __init__(self):
+    self.operations = {}
+    self.modules = []
+    pass
+
+  #
+  def append_module(self, module):
+    '''
+    Appends a module and takes ownership of operations used to construct it.
+    '''
+    
+    self.modules.append(module)
+
+    for operation in module.operations:
+      self.operations[operation.name()] = operation
+
+
+#################################################################################################
diff --git a/tools/library/scripts/symm_operation.py b/tools/library/scripts/symm_operation.py
new file mode 100644
index 00000000..dbee11b4
--- /dev/null
+++ b/tools/library/scripts/symm_operation.py
@@ -0,0 +1,400 @@
+#
+# \file generator.py
+#
+# \brief Generates the CUTLASS Library's instances
+#
+# 
+
+import enum
+import os.path
+import shutil
+import functools
+import operator
+
+from library import *
+
+
+###################################################################################################
+#
+# Data structure modeling a Symm update operation
+#
+###################################################################################################
+
+#
+class SymmOperation:
+  #
+  def __init__(self, symm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, \
+      blas_mode = BlasMode.symmetric):
+
+    self.blas_mode = blas_mode
+    self.operation_kind = OperationKind.Symm
+    self.arch = arch
+    self.tile_description = tile_description
+    self.symm_kind = symm_kind
+    # tensor A and B have same data type and layout
+    self.A = A
+    self.B = B  
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex, 
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+
+  #
+  def is_planar_complex(self):
+    return False
+
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+
+    if self.is_complex():
+      return get_complex_from_real(accum)
+
+    return accum
+
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+
+
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+    }
+
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+
+    operation_name = 'symm' if self.blas_mode == BlasMode.symmetric else 'hemm'
+
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, operation_name)
+
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+
+    return extended_name
+
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)] 
+      )
+    return "%s" % (ShortLayoutTypeNames[self.A.layout])
+
+  #
+  def side_mode_name(self):
+    return "%s" % (ShortSideModeNames[self.A.side_mode])
+
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.A.fill_mode])
+
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+
+    alignment = self.C.alignment
+
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${side_mode}_${fill_mode}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'side_mode': self.side_mode_name(),
+        'fill_mode': self.fill_mode_name(),
+        'alignment': "%d" % alignment,
+      }
+    )
+
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+
+#
+class EmitSymmUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+
+  def __init__(self):
+    self.symm_template = """
+// Symm operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Symm<
+    ${element_a}, ${layout_a}, ${side_mode}, ${fill_mode}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.symm_complex_template = """
+// Symm operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Symm<
+    ${element_a}, ${layout_a}, ${side_mode}, ${fill_mode}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue}
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${blas_mode}
+>;
+"""
+
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    
+    warp_count = operation.tile_description.warp_count
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'side_mode': SideModeTag[operation.A.side_mode],
+      'fill_mode': FillModeTag[operation.A.fill_mode],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(operation.A.alignment),
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false', 
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'blas_mode': BlasModeTag[operation.blas_mode]
+    }
+
+    symm_template = self.symm_complex_template if operation.is_complex() else self.symm_template
+
+    return SubstituteTemplate(symm_template, values)
+
+###################################################################################################
+
+
+###################################################################################################
+#
+# Emitters functions for all targets
+#
+###################################################################################################
+
+class EmitSymmConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+
+    self.instance_emitter = {
+      SymmKind.Universal: EmitSymmUniversalInstance,
+    }
+
+    self.symm_kind_wrappers = {
+      SymmKind.Universal: 'SymmOperation',
+    }
+
+    self.instance_template = {
+      SymmKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${symm_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+
+    self.header_template = """
+/*
+  Generated by symm_operation.py - Do not edit.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+
+#include "library_internal.h"
+#include "symm_operation.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+    self.initialize_function_template = """
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+void initialize_${configuration_name}(Manifest &manifest) {
+
+"""
+    self.epilogue_template = """
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+
+    self.instance_definitions = []
+    self.instance_wrappers = []
+
+    self.operations = []
+    return self
+
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.symm_kind]()
+
+    self.operations.append(operation)
+
+    self.instance_definitions.append(emitter.emit(operation))
+
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.symm_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'symm_kind': self.symm_kind_wrappers[operation.symm_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "" 
+      }))
+
+  def __exit__(self, exception_type, exception_value, traceback):
+
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+   
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper) 
+
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+
+###################################################################################################
diff --git a/tools/library/scripts/trmm_operation.py b/tools/library/scripts/trmm_operation.py
new file mode 100644
index 00000000..7e03e278
--- /dev/null
+++ b/tools/library/scripts/trmm_operation.py
@@ -0,0 +1,407 @@
+#
+# \file generator.py
+#
+# \brief Generates the CUTLASS Library's instances
+#
+# 
+
+import enum
+import os.path
+import shutil
+import functools
+import operator
+
+from library import *
+
+
+###################################################################################################
+#
+# Data structure modeling a TRMM operation
+#
+###################################################################################################
+
+#
+class TrmmOperation:
+  #
+  def __init__(self, trmm_kind, arch, tile_description, A, B, C, element_epilogue, \
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
+
+    self.operation_kind = OperationKind.Trmm
+    self.arch = arch
+    self.tile_description = tile_description
+    self.trmm_kind = trmm_kind
+    self.A = A
+    self.B = B
+    self.C = C
+    self.element_epilogue = element_epilogue
+    self.epilogue_functor = epilogue_functor
+    self.swizzling_functor = swizzling_functor
+
+  #
+  def is_complex(self):
+    complex_operators = [
+      MathOperation.multiply_add_complex, 
+      MathOperation.multiply_add_complex_gaussian,
+      MathOperation.multiply_add_complex_fast_f32
+    ]
+    return self.tile_description.math_instruction.math_operation in complex_operators
+    return False
+
+  #
+  def is_planar_complex(self):
+#   return self.trmm_kind in (TrmmKind.PlanarComplex, TrmmKind.PlanarComplexArray)
+    return False
+
+  #
+  def accumulator_type(self):
+    accum = self.tile_description.math_instruction.element_accumulator
+
+    if self.is_complex():
+      return get_complex_from_real(accum)
+
+    return accum
+
+  #
+  def short_math_name(self):
+    if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian:
+      return "g%s" % ShortDataTypeNames[self.accumulator_type()]
+    return ShortDataTypeNames[self.accumulator_type()]
+
+
+  #
+  def core_name(self):
+    ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
+    
+    inst_shape = ''
+    inst_operation = ''
+    intermediate_type = ''
+
+    math_operations_map = {
+      MathOperation.xor_popc: 'xor',
+    }
+
+    if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \
+      self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp:
+
+      math_op = self.tile_description.math_instruction.math_operation
+      math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else ''
+
+      inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
+      inst_shape += math_op_string
+
+      if self.tile_description.math_instruction.element_a != self.A.element and \
+        self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator:
+        intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
+
+    return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, TrmmKindNames[self.trmm_kind])
+
+  #
+  def extended_name(self):
+    ''' Append data types if they differ from compute type. '''
+    if self.is_complex():
+      extended_name = "${core_name}"
+    else:
+      if self.C.element != self.tile_description.math_instruction.element_accumulator and \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${element_c}_${core_name}_${element_a}"
+      elif self.C.element == self.tile_description.math_instruction.element_accumulator and  \
+        self.A.element != self.tile_description.math_instruction.element_accumulator:
+        extended_name = "${core_name}_${element_a}"
+      else:
+        extended_name = "${core_name}"
+
+    extended_name = SubstituteTemplate(extended_name, {
+      'element_a': DataTypeNames[self.A.element],
+      'element_c': DataTypeNames[self.C.element],
+      'core_name': self.core_name()
+      })
+
+    return extended_name
+
+  #
+  def layout_name(self):
+    if self.is_complex() or self.is_planar_complex():
+      return "%s%s" % (
+        ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)], 
+        ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)] 
+      )
+    return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout])
+
+  #
+  def side_mode_name(self):
+    return "%s" % (ShortSideModeNames[self.A.side_mode])
+
+  #
+  def fill_mode_name(self):
+    return "%s" % (ShortFillModeNames[self.A.fill_mode])
+
+  #
+  def diag_type_name(self):
+    return "%s" % (ShortDiagTypeNames[self.A.diag_type])
+
+  #
+  def procedural_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    threadblock = self.tile_description.procedural_name()
+
+    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
+
+    alignment = max([self.C.alignment])
+
+    return SubstituteTemplate(
+      "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_${side_mode}_${fill_mode}_${diag_type}_align${alignment}",
+      {
+        'opcode_class': opcode_class_name,
+        'extended_name': self.extended_name(),
+        'threadblock': threadblock,
+        'layout': self.layout_name(),
+        'side_mode': self.side_mode_name(),
+        'fill_mode': self.fill_mode_name(),
+        'diag_type': self.diag_type_name(),
+        'alignment': "%d" % self.C.alignment,
+      }
+    )
+
+  #
+  def configuration_name(self):
+    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
+    return self.procedural_name()
+
+###################################################################################################
+#
+# Emits single instances of a CUTLASS device-wide operator
+#
+###################################################################################################
+
+#
+class EmitTrmmUniversalInstance:
+  ''' Responsible for emitting a CUTLASS template definition'''
+
+  def __init__(self):
+    self.trmm_template = """
+// Trmm operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Trmm<
+    ${element_a}, ${layout_a},
+    ${side_mode}, ${fill_mode}, ${diag_type}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation}
+>;
+"""
+    self.trmm_complex_template = """
+// Trmm operator ${operation_name}
+using Operation_${operation_name} = 
+  typename cutlass::gemm::device::Trmm<
+    ${element_a}, ${layout_a}, 
+    ${side_mode}, ${fill_mode}, ${diag_type}, 
+    ${element_b}, ${layout_b}, 
+    ${element_c}, ${layout_c},
+    ${element_accumulator},
+    ${opcode_class},
+    ${arch},
+    cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
+    cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
+    cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
+    ${epilogue_functor}<
+      ${element_c},
+      ${epilogue_vector_length},
+      ${element_accumulator},
+      ${element_epilogue},
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
+    >,
+    ${swizzling_functor},
+    ${stages},
+    ${align_a},
+    ${align_b},
+    ${split_k_serial},
+    ${math_operation},
+    ${transform_a} 
+>;
+"""
+
+  def emit(self, operation):
+
+    threadblock_shape = operation.tile_description.threadblock_shape
+    warp_count = operation.tile_description.warp_count
+
+    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
+
+    epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
+
+    values = {
+      'operation_name': operation.procedural_name(),
+      'element_a': DataTypeTag[operation.A.element],
+      'layout_a': LayoutTag[operation.A.layout],
+      'side_mode' : SideModeTag[operation.A.side_mode], 
+      'fill_mode': FillModeTag[operation.A.fill_mode],
+      'diag_type' : DiagTypeTag[operation.A.diag_type],
+      'element_b': DataTypeTag[operation.B.element],
+      'layout_b': LayoutTag[operation.B.layout],
+      'element_c': DataTypeTag[operation.C.element],
+      'layout_c': LayoutTag[operation.C.layout],
+      'element_accumulator': DataTypeTag[operation.accumulator_type()],
+      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
+      'arch': "cutlass::arch::Sm%d" % operation.arch,
+      'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
+      'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
+      'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
+      'warp_shape_m': str(warp_shape[0]),
+      'warp_shape_n': str(warp_shape[1]),
+      'warp_shape_k': str(warp_shape[2]),
+      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
+      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
+      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'epilogue_vector_length': str(epilogue_vector_length),
+      'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
+      'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
+      'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
+      'stages': str(operation.tile_description.stages),
+      'align_a': str(1),  # TRMM A's alignment is always 1 for no padding to work until we make zfill work with variable bytes
+      'align_b': str(operation.B.alignment),
+      'split_k_serial': 'false', 
+      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
+      'transform_a': ComplexTransformTag[operation.A.complex_transform]
+    }
+
+    trmm_template = self.trmm_complex_template if operation.is_complex() else self.trmm_template
+
+    return SubstituteTemplate(trmm_template, values)
+
+###################################################################################################
+
+
+###################################################################################################
+#
+# Emitters functions for all targets
+#
+###################################################################################################
+
+class EmitTrmmConfigurationLibrary:
+  def __init__(self, operation_path, configuration_name):
+    self.configuration_name = configuration_name
+    self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/')
+
+    self.instance_emitter = {
+      TrmmKind.Universal: EmitTrmmUniversalInstance,
+    }
+
+    self.trmm_kind_wrappers = {
+      TrmmKind.Universal: 'TrmmOperation',
+    }
+
+    self.instance_template = {
+      TrmmKind.Universal: """
+${compile_guard_start}
+  manifest.append(new ${trmm_kind}<
+    Operation_${operation_name}
+  >("${operation_name}"));
+${compile_guard_end}
+"""
+    }
+
+    self.header_template = """
+/*
+  Generated by trmm_operation.py - Do not edit.
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/cutlass.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/manifest.h"
+
+#include "library_internal.h"
+#include "trmm_operation.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+    self.initialize_function_template = """
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+void initialize_${configuration_name}(Manifest &manifest) {
+
+"""
+    self.epilogue_template = """
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+"""
+
+  def __enter__(self):
+    self.configuration_file = open(self.configuration_path, "w")
+    self.configuration_file.write(self.header_template)
+
+    self.instance_definitions = []
+    self.instance_wrappers = []
+
+    self.operations = []
+    return self
+
+  def emit(self, operation):
+    emitter = self.instance_emitter[operation.trmm_kind]()
+
+    self.operations.append(operation)
+
+    self.instance_definitions.append(emitter.emit(operation))
+
+    self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.trmm_kind], {
+      'configuration_name': self.configuration_name,
+      'operation_name': operation.procedural_name(),
+      'trmm_kind': self.trmm_kind_wrappers[operation.trmm_kind],
+      'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "",
+      'compile_guard_end': "#endif" \
+        if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "" 
+      }))
+
+  def __exit__(self, exception_type, exception_value, traceback):
+
+    # Write instance definitions in top-level namespace
+    for instance_definition in self.instance_definitions:
+      self.configuration_file.write(instance_definition)
+
+    # Add wrapper objects within initialize() function
+    self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, {
+      'configuration_name': self.configuration_name
+      }))
+   
+    for instance_wrapper in self.instance_wrappers:
+      self.configuration_file.write(instance_wrapper) 
+
+    self.configuration_file.write(self.epilogue_template)
+    self.configuration_file.close()
+
+###################################################################################################
diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h
index 9f7fc89a..4ee58f0d 100644
--- a/tools/library/src/conv2d_operation.h
+++ b/tools/library/src/conv2d_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h
index 704cd3ca..34d88a6e 100644
--- a/tools/library/src/conv3d_operation.h
+++ b/tools/library/src/conv3d_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h
index 31f76e9b..84ce0a91 100644
--- a/tools/library/src/gemm_operation.h
+++ b/tools/library/src/gemm_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -1142,6 +1148,208 @@ class GemmPlanarComplexArrayOperation : public GemmOperationBase<Operator_> {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+template <typename Operator_>
+class GemmGroupedOperation : public GemmOperationBase<Operator_> {
+public:
+
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+public:
+
+  /// Constructor
+  GemmGroupedOperation(char const *name = "unknown_gemm"):
+    GemmOperationBase<Operator_>(name) {
+
+    this->description_.gemm_kind = GemmKind::kGrouped;
+  }
+
+protected:
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status construct_arguments_(
+    OperatorArguments &op_args,
+    GemmGroupedConfiguration const *config) {
+
+    op_args.problem_count = config->problem_count;
+    op_args.threadblock_count = config->threadblock_count;
+
+    return Status::kSuccess;
+  }
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status update_arguments_(
+    OperatorArguments &op_args,
+    GemmGroupedArguments const *arguments) {
+
+    if (arguments->pointer_mode == ScalarPointerMode::kHost) {
+
+      typename Operator::EpilogueOutputOp::Params params(
+        *static_cast<ElementCompute const *>(arguments->alpha),
+        *static_cast<ElementCompute const *>(arguments->beta)
+      );
+
+      op_args.output_op = params;
+    }
+    else if (arguments->pointer_mode == ScalarPointerMode::kDevice) {
+
+      typename Operator::EpilogueOutputOp::Params params(
+        static_cast<ElementCompute const *>(arguments->alpha),
+        static_cast<ElementCompute const *>(arguments->beta)
+      );
+
+      op_args.output_op = params;
+    }
+    else {
+      return Status::kErrorInvalidProblem;
+    }
+
+    op_args.problem_sizes = arguments->problem_sizes;
+
+    op_args.ptr_A         = static_cast<ElementA **>(arguments->ptr_A);
+    op_args.ptr_B         = static_cast<ElementB **>(arguments->ptr_B);
+    op_args.ptr_C         = static_cast<ElementC **>(arguments->ptr_C);
+    op_args.ptr_D         = static_cast<ElementC **>(arguments->ptr_D);
+
+    op_args.lda           = arguments->lda;
+    op_args.ldb           = arguments->ldb;
+    op_args.ldc           = arguments->ldc;
+    op_args.ldd           = arguments->ldd;
+
+    return Status::kSuccess;
+  }
+
+public:
+
+  /// Returns success if the operation can proceed
+  virtual Status can_implement(
+    void const *configuration_ptr,
+    void const *arguments_ptr) const {
+
+    GemmGroupedConfiguration const *configuration =
+      static_cast<GemmGroupedConfiguration const *>(configuration_ptr);
+
+    GemmGroupedArguments const *arguments =
+      static_cast<GemmGroupedArguments const *>(arguments_ptr);
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(args, configuration);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = update_arguments_(args, arguments);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Operator::can_implement(args);
+  }
+
+  /// Gets the host-side workspace
+  virtual uint64_t get_host_workspace_size(
+    void const *configuration) const {
+
+    return sizeof(Operator);
+  }
+
+  /// Gets the device-side workspace
+  virtual uint64_t get_device_workspace_size(
+    void const *configuration_ptr,
+    void const *arguments_ptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args,
+      static_cast<GemmGroupedConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    status = update_arguments_(
+      args,
+      static_cast<GemmGroupedArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    uint64_t size = Operator::get_workspace_size(args);
+
+    return size;
+  }
+
+  /// Initializes the workspace
+  virtual Status initialize(
+    void const *configuration_ptr,
+    void *host_workspace,
+    void *device_workspace,
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args,
+      static_cast<GemmGroupedConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = new (host_workspace) Operator;
+
+    status = op->initialize(args, device_workspace, stream);
+
+    return status;
+  }
+
+  /// Runs the kernel
+  virtual Status run(
+    void const *arguments_ptr,
+    void *host_workspace,
+    void *device_workspace = nullptr,
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = update_arguments_(
+      args,
+      static_cast<GemmGroupedArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = static_cast<Operator *>(host_workspace);
+
+    status = op->update(args, device_workspace);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = op->run(stream);
+
+    return status;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 } // namespace library
 } // namespace cutlass
 
diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu
index 88243aa8..c2ee5409 100644
--- a/tools/library/src/handle.cu
+++ b/tools/library/src/handle.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -648,6 +654,7 @@ Status Handle::gemm_universal(
   }
 
   // Run the operator
+
   return operation->run(&arguments, host_workspace, workspace_, stream_);
 }
 
diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h
index 8050e9ac..91e429b8 100644
--- a/tools/library/src/library_internal.h
+++ b/tools/library/src/library_internal.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -318,6 +324,15 @@ template <> struct IteratorAlgorithmMap<conv::IteratorAlgorithm::kAnalytic> {
 template <> struct IteratorAlgorithmMap<conv::IteratorAlgorithm::kOptimized> {
   static IteratorAlgorithmID const kId = IteratorAlgorithmID::kOptimized;
 };
+
+template <> struct IteratorAlgorithmMap<conv::IteratorAlgorithm::kFixedChannels> {
+  static IteratorAlgorithmID const kId = IteratorAlgorithmID::kFixedChannels;
+};
+
+template <> struct IteratorAlgorithmMap<conv::IteratorAlgorithm::kFewChannels> {
+  static IteratorAlgorithmID const kId = IteratorAlgorithmID::kFewChannels;
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <typename Element, typename Layout>
diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp
index bacada10..ebd5b46d 100644
--- a/tools/library/src/manifest.cpp
+++ b/tools/library/src/manifest.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu
index 20e80471..b31e042b 100644
--- a/tools/library/src/operation_table.cu
+++ b/tools/library/src/operation_table.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -75,7 +81,6 @@ void OperationTable::append(Manifest const &manifest) {
       gemm_operations[functional_key][preference_key].push_back(op);
     }
 
-
     // insert all conv2d or conv3d operation into operation table
     if (desc.kind == OperationKind::kConv2d || desc.kind == OperationKind::kConv3d) {
       auto &conv_desc = static_cast<library::ConvDescription const &>(desc);
diff --git a/tools/library/src/rank_2k_operation.h b/tools/library/src/rank_2k_operation.h
new file mode 100644
index 00000000..7e186161
--- /dev/null
+++ b/tools/library/src/rank_2k_operation.h
@@ -0,0 +1,373 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines operations for all Rank 2K operation kinds (Syr2k, Her2k) 
+    in CUTLASS Library.
+
+  
+*/
+
+#pragma once
+#include <iostream>
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/rank_2k.h"
+#include "cutlass/gemm/kernel/default_rank_2k_universal.h"
+
+#include "cutlass/library/library.h"
+#include "library_internal.h"
+#include "cutlass/core_io.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class Rank2KOperationBase : public Operation {
+public:
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static int const kUpdateRank = Operator::kUpdateRank;
+  static FillMode const kFillModeC = Operator::kFillModeC;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+protected:
+
+  /// 
+  RankKDescription description_;
+
+public:
+
+  /// Constructor
+  Rank2KOperationBase(char const *name = "unknown_rank_k") {
+
+    description_.name = name;
+    description_.provider = Provider::kCUTLASS;
+    description_.rank_k_kind = RankKKind::kUniversal;
+    description_.fill_mode = kFillModeC;    
+    description_.blas_mode = kBlasMode;
+    description_.num_ranks = kUpdateRank;
+
+    description_.kind = OperationKind::kRank2K;
+
+    description_.tile_description.threadblock_shape = make_Coord(
+      Operator::ThreadblockShape::kM,
+      Operator::ThreadblockShape::kN,
+      Operator::ThreadblockShape::kK);
+
+    description_.tile_description.threadblock_stages = Operator::kStages;
+
+    description_.tile_description.warp_count = make_Coord(
+      Operator::Rank2Kkernel::WarpCount::kM,
+      Operator::Rank2Kkernel::WarpCount::kN,
+      Operator::Rank2Kkernel::WarpCount::kK);
+    
+    description_.tile_description.math_instruction.instruction_shape = make_Coord(
+      Operator::InstructionShape::kM,
+      Operator::InstructionShape::kN,
+      Operator::InstructionShape::kK);
+
+    description_.tile_description.math_instruction.element_accumulator = 
+      NumericTypeMap<ElementAccumulator>::kId;
+
+    description_.tile_description.math_instruction.opcode_class = 
+      OpcodeClassMap<typename Operator::OperatorClass>::kId;
+
+    description_.tile_description.math_instruction.math_operation =
+      MathOperationMap<typename Operator::Operator>::kId;
+
+    description_.tile_description.minimum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMin;
+
+    description_.tile_description.maximum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMax;
+    
+    description_.A = make_TensorDescription<ElementA, LayoutA>(Operator::kAlignmentA);
+    description_.B = make_TensorDescription<ElementB, LayoutB>(Operator::kAlignmentB);
+    description_.C = make_TensorDescription<ElementC, LayoutC>(Operator::kAlignmentC);
+    description_.element_epilogue = NumericTypeMap<ElementCompute>::kId;
+
+    description_.split_k_mode = SplitKMode::kNone;
+    description_.transform_A = ComplexTransformMap<Operator::kTransformA>::kId;
+    description_.transform_B = ComplexTransformMap<Operator::kTransformB>::kId;
+  }
+  
+  /// Returns the description of the SYRK operation
+  virtual OperationDescription const & description() const {
+    return description_;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class Rank2KOperation : public Rank2KOperationBase<Operator_> {
+public:
+
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static int const kUpdateRank = Operator::kUpdateRank;
+  static FillMode const kFillModeC = Operator::kFillModeC;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+public:
+
+  /// Constructor
+  Rank2KOperation(char const *name = "unknown_rank_2k"): 
+    Rank2KOperationBase<Operator_>(name) {
+
+    this->description_.rank_k_kind = RankKKind::kUniversal;
+  }
+
+protected:
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status construct_arguments_(
+    OperatorArguments &operator_args,
+    RankKConfiguration const *configuration) {
+
+    //operator_args.mode = configuration->mode;
+
+    operator_args.problem_size = configuration->problem_size;
+    operator_args.batch_count = configuration->batch_count;
+
+    operator_args.lda = int(configuration->lda);
+    operator_args.ldb = int(configuration->ldb);
+    operator_args.ldc = int(configuration->ldc);
+    operator_args.ldd = int(configuration->ldd);
+    
+    return Status::kSuccess;
+  }
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status update_arguments_(
+    OperatorArguments &operator_args,
+    RankKArguments const *arguments) {
+    
+    if (arguments->pointer_mode == ScalarPointerMode::kHost) {
+      typename Operator::EpilogueOutputOp::Params params(
+        *static_cast<ElementCompute const *>(arguments->alpha),
+        *static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params;
+    }
+    else if (arguments->pointer_mode == ScalarPointerMode::kDevice){
+      typename Operator::EpilogueOutputOp::Params params(
+        static_cast<ElementCompute const *>(arguments->alpha),
+        static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params; 
+    }
+    else {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // update arguments
+    operator_args.ptr_A = arguments->A;
+    operator_args.ptr_B = arguments->B;
+    operator_args.ptr_C = arguments->C;
+    operator_args.ptr_D = arguments->D;
+
+    operator_args.batch_stride_A = arguments->batch_stride_A;
+    operator_args.batch_stride_B = arguments->batch_stride_B;
+    operator_args.batch_stride_C = arguments->batch_stride_C;
+    operator_args.batch_stride_D = arguments->batch_stride_D;
+    
+    return Status::kSuccess;
+  }
+
+public:
+
+  /// Returns success if the operation can proceed
+  virtual Status can_implement(
+    void const *configuration_ptr, 
+    void const *arguments_ptr) const {
+    
+    RankKConfiguration const *configuration = 
+      static_cast<RankKConfiguration const *>(configuration_ptr);
+
+    RankKArguments const *arguments = 
+      static_cast<RankKArguments const *>(arguments_ptr);
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(args, configuration);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = update_arguments_(args, arguments);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Operator::can_implement(args);
+  }
+  
+  /// Gets the host-side workspace
+  virtual uint64_t get_host_workspace_size(
+    void const *configuration) const {
+
+    return sizeof(Operator);
+  }
+  
+  /// Gets the device-side workspace
+  virtual uint64_t get_device_workspace_size(
+    void const *configuration_ptr,
+    void const *arguments_ptr = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<RankKConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    uint64_t size = Operator::get_workspace_size(args);
+
+    return size;
+  }
+  
+  /// Initializes the workspace
+  virtual Status initialize(
+    void const *configuration_ptr, 
+    void *host_workspace, 
+    void *device_workspace, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<RankKConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = new (host_workspace) Operator;
+    
+    //std::cout << "initialize() library::Rank2KOperation" << std::endl;
+    //print_operator_args(args);
+    status = op->initialize(args, device_workspace, stream);
+    
+    return status;
+  }
+
+  /// Runs the kernel
+  virtual Status run(
+    void const *arguments_ptr,
+    void *host_workspace, 
+    void *device_workspace = nullptr, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+    
+    Status status = update_arguments_(
+      args, 
+      static_cast<RankKArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    Operator *op = static_cast<Operator *>(host_workspace);
+    
+    status = op->update(args, device_workspace);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    //std::cout << "run() library::Rank2KOperation" << std::endl;
+    //print_operator_args(args);
+    status = op->run(stream);
+    
+    return status;
+  }
+
+  /// Call print_operator_args  from the Conv2dOperation::initialize()
+  // to dump arguments passed on to cutlass operator for debugging
+  void print_operator_args(OperatorArguments &operator_args) const {
+    std::cout << "Rank2KOperation::OperatorArguments" << std::endl
+              << "  problem_size:" << std::endl 
+              << operator_args.problem_size << std::endl
+              << "  epilouge (alpha, beta): "
+              << operator_args.epilogue.alpha << ", " 
+              << operator_args.epilogue.beta << std::endl
+              << "  ref_A (ptr, {stride}): " 
+              << operator_args.ptr_A << ", {"
+              << operator_args.lda << "}" << std::endl
+              << "  ref_B (ptr, {stride}): " 
+              << operator_args.ptr_B << ", {"
+              << operator_args.ldb << "}" << std::endl
+              << "  ref_C (ptr, {stride}): "
+              << operator_args.ptr_C << ", {"
+              << operator_args.ldc << "}" << std::endl
+              << "  ref_D (ptr, {stride}): "
+              << operator_args.ptr_D << ", {"
+              << operator_args.ldd << "}" << std::endl;
+  } 
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/library/src/rank_k_operation.h b/tools/library/src/rank_k_operation.h
new file mode 100644
index 00000000..270db571
--- /dev/null
+++ b/tools/library/src/rank_k_operation.h
@@ -0,0 +1,344 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines operations for all Rank K operation kinds (Syrk, Herk) 
+    in CUTLASS Library.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/rank_k.h"
+#include "cutlass/gemm/kernel/default_rank_k_universal.h"
+
+#include "cutlass/library/library.h"
+#include "library_internal.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class RankKOperationBase : public Operation {
+public:
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementA;
+  using LayoutB = typename Operator::LayoutA;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static int const kUpdateRank = Operator::kUpdateRank;
+  static FillMode const kFillModeC = Operator::kFillModeC;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+protected:
+
+  /// 
+  RankKDescription description_;
+
+public:
+
+  /// Constructor
+  RankKOperationBase(char const *name = "unknown_rank_k") {
+
+    description_.name = name;
+    description_.provider = Provider::kCUTLASS;
+    description_.rank_k_kind = RankKKind::kUniversal;
+    description_.fill_mode = kFillModeC;    
+    description_.blas_mode = kBlasMode;
+    description_.num_ranks = kUpdateRank;
+
+    description_.kind = OperationKind::kRankK;
+
+    description_.tile_description.threadblock_shape = make_Coord(
+      Operator::ThreadblockShape::kM,
+      Operator::ThreadblockShape::kN,
+      Operator::ThreadblockShape::kK);
+
+    description_.tile_description.threadblock_stages = Operator::kStages;
+
+    description_.tile_description.warp_count = make_Coord(
+      Operator::RankKkernel::WarpCount::kM,
+      Operator::RankKkernel::WarpCount::kN,
+      Operator::RankKkernel::WarpCount::kK);
+    
+    description_.tile_description.math_instruction.instruction_shape = make_Coord(
+      Operator::InstructionShape::kM,
+      Operator::InstructionShape::kN,
+      Operator::InstructionShape::kK);
+
+    description_.tile_description.math_instruction.element_accumulator = 
+      NumericTypeMap<ElementAccumulator>::kId;
+
+    description_.tile_description.math_instruction.opcode_class = 
+      OpcodeClassMap<typename Operator::OperatorClass>::kId;
+
+    description_.tile_description.math_instruction.math_operation =
+      MathOperationMap<typename Operator::Operator>::kId;
+
+    description_.tile_description.minimum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMin;
+
+    description_.tile_description.maximum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMax;
+    
+    description_.A = make_TensorDescription<ElementA, LayoutA>(Operator::kAlignmentA);
+    description_.B = make_TensorDescription<ElementA, LayoutA>(Operator::kAlignmentA);
+    description_.C = make_TensorDescription<ElementC, LayoutC>(Operator::kAlignmentC);
+    description_.element_epilogue = NumericTypeMap<ElementCompute>::kId;
+
+    description_.split_k_mode = SplitKMode::kNone;
+    description_.transform_A = ComplexTransformMap<Operator::kTransformA>::kId;
+    description_.transform_B = ComplexTransformMap<Operator::kTransformA>::kId;
+  }
+  
+  /// Returns the description of the SYRK operation
+  virtual OperationDescription const & description() const {
+    return description_;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class RankKOperation : public RankKOperationBase<Operator_> {
+public:
+
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementA;
+  using LayoutB = typename Operator::LayoutA;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static int const kUpdateRank = Operator::kUpdateRank;
+  static FillMode const kFillModeC = Operator::kFillModeC;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+public:
+
+  /// Constructor
+  RankKOperation(char const *name = "unknown_rank_k"): 
+    RankKOperationBase<Operator_>(name) {
+
+    this->description_.rank_k_kind = RankKKind::kUniversal;
+  }
+
+protected:
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status construct_arguments_(
+    OperatorArguments &operator_args,
+    RankKConfiguration const *configuration) {
+
+    //operator_args.mode = configuration->mode;
+
+    operator_args.problem_size = configuration->problem_size;
+    operator_args.batch_count = configuration->batch_count;
+
+    operator_args.lda = int(configuration->lda);
+    operator_args.ldb = int(configuration->lda);
+    operator_args.ldc = int(configuration->ldc);
+    operator_args.ldd = int(configuration->ldd);
+    
+    return Status::kSuccess;
+  }
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status update_arguments_(
+    OperatorArguments &operator_args,
+    RankKArguments const *arguments) {
+    
+    if (arguments->pointer_mode == ScalarPointerMode::kHost) {
+      typename Operator::EpilogueOutputOp::Params params(
+        *static_cast<ElementCompute const *>(arguments->alpha),
+        *static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params;
+    }
+    else if (arguments->pointer_mode == ScalarPointerMode::kDevice){
+      typename Operator::EpilogueOutputOp::Params params(
+        static_cast<ElementCompute const *>(arguments->alpha),
+        static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params; 
+    }
+    else {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // update arguments
+    operator_args.ptr_A = arguments->A;
+    operator_args.ptr_C = arguments->C;
+    operator_args.ptr_D = arguments->D;
+
+    operator_args.batch_stride_A = arguments->batch_stride_A;
+    operator_args.batch_stride_C = arguments->batch_stride_C;
+    operator_args.batch_stride_D = arguments->batch_stride_D;
+    
+    return Status::kSuccess;
+  }
+
+public:
+
+  /// Returns success if the operation can proceed
+  virtual Status can_implement(
+    void const *configuration_ptr, 
+    void const *arguments_ptr) const {
+    
+    RankKConfiguration const *configuration = 
+      static_cast<RankKConfiguration const *>(configuration_ptr);
+
+    RankKArguments const *arguments = 
+      static_cast<RankKArguments const *>(arguments_ptr);
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(args, configuration);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = update_arguments_(args, arguments);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Operator::can_implement(args);
+  }
+  
+  /// Gets the host-side workspace
+  virtual uint64_t get_host_workspace_size(
+    void const *configuration) const {
+
+    return sizeof(Operator);
+  }
+  
+  /// Gets the device-side workspace
+  virtual uint64_t get_device_workspace_size(
+    void const *configuration_ptr,
+    void const *arguments_ptr = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<RankKConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    uint64_t size = Operator::get_workspace_size(args);
+
+    return size;
+  }
+  
+  /// Initializes the workspace
+  virtual Status initialize(
+    void const *configuration_ptr, 
+    void *host_workspace, 
+    void *device_workspace, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<RankKConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = new (host_workspace) Operator;
+
+    status = op->initialize(args, device_workspace, stream);
+    
+    return status;
+  }
+
+  /// Runs the kernel
+  virtual Status run(
+    void const *arguments_ptr,
+    void *host_workspace, 
+    void *device_workspace = nullptr, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+    
+    Status status = update_arguments_(
+      args, 
+      static_cast<RankKArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    Operator *op = static_cast<Operator *>(host_workspace);
+    
+    status = op->update(args, device_workspace);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    status = op->run(stream);
+    
+    return status;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu
index 9b49a1f1..21b95818 100644
--- a/tools/library/src/reduction/init_reduction_operations.cu
+++ b/tools/library/src/reduction/init_reduction_operations.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -38,6 +44,7 @@ namespace library {
 ///////////////////////////////////////////////////////////////////////////////////////////////
 void initialize_reduce_add_linear_combination_f32_f32_f16(Manifest &manifest);
 void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest);
+void initialize_reduce_add_linear_combination_f64_f64_f64(Manifest &manifest);
 void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest);
 
 //
@@ -47,6 +54,7 @@ void initialize_all_reduction_op(Manifest &manifest) {
 
   initialize_reduce_add_linear_combination_f32_f32_f16(manifest);
   initialize_reduce_add_linear_combination_f32_f32_f32(manifest);
+  initialize_reduce_add_linear_combination_f64_f64_f64(manifest);
   initialize_reduce_add_linear_combination_cf32_cf32_cf32(manifest);
 
 }
diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu
index 5c3776c0..7283a6ba 100644
--- a/tools/library/src/reduction/reduction_device.cu
+++ b/tools/library/src/reduction/reduction_device.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -106,6 +112,40 @@ void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest) {
   ));
 }
 
+void initialize_reduce_add_linear_combination_f64_f64_f64(Manifest &manifest) {
+
+  using ElementWorkspace = double; 
+  using ElementAccumulator = double;
+  using ElementOutput = double;
+  using ElementCompute = double;
+
+  using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination<
+    ElementOutput,
+    128 / cutlass::sizeof_bits<ElementWorkspace>::value,
+    ElementAccumulator,
+    ElementCompute
+  >;
+
+  using ReductionOp = cutlass::reduction::thread::ReduceAdd<
+    ElementAccumulator, 
+    typename EpilogueOutputOp::ElementAccumulator,
+    EpilogueOutputOp::kCount
+  >;
+
+  using Operation_reduce_add_linear_combination_f64_f64_f64 = cutlass::reduction::device::ReduceSplitK<
+    cutlass::reduction::kernel::ReduceSplitK<
+      cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>,
+      EpilogueOutputOp,
+      ReductionOp
+    >
+  >;
+
+  manifest.append(new ReductionOperation<
+    Operation_reduce_add_linear_combination_f64_f64_f64>(
+      "reduce_add_linear_combination_f64_f64_f64"
+  ));
+}
+
 void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest) {
 
   using ElementWorkspace = cutlass::complex<float>; 
diff --git a/tools/library/src/reduction/reduction_operation.h b/tools/library/src/reduction/reduction_operation.h
index 03996c91..d04b0f31 100644
--- a/tools/library/src/reduction/reduction_operation.h
+++ b/tools/library/src/reduction/reduction_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu
index 58c672d7..87fc92d4 100644
--- a/tools/library/src/reference/conv2d.cu
+++ b/tools/library/src/reference/conv2d.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu
index 8645aef2..92ca302a 100644
--- a/tools/library/src/reference/conv3d.cu
+++ b/tools/library/src/reference/conv3d.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h
index 9837ed07..d9dfc2bf 100644
--- a/tools/library/src/reference/conv_reference_operation.h
+++ b/tools/library/src/reference/conv_reference_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/gemm.cu b/tools/library/src/reference/gemm.cu
index 3434ef19..19181a64 100644
--- a/tools/library/src/reference/gemm.cu
+++ b/tools/library/src/reference/gemm.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/gemm_reference_operation.h b/tools/library/src/reference/gemm_reference_operation.h
index 3acf5817..b6c45ebc 100644
--- a/tools/library/src/reference/gemm_reference_operation.h
+++ b/tools/library/src/reference/gemm_reference_operation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu
index fbe806a9..a8e34e6f 100644
--- a/tools/library/src/reference/initialize_reference_operations.cu
+++ b/tools/library/src/reference/initialize_reference_operations.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/library/src/singleton.cu b/tools/library/src/singleton.cu
index a921f7c6..23ad294f 100644
--- a/tools/library/src/singleton.cu
+++ b/tools/library/src/singleton.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -60,4 +66,3 @@ Singleton const & Singleton::get() {
 } // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
diff --git a/tools/library/src/symm_operation.h b/tools/library/src/symm_operation.h
new file mode 100644
index 00000000..f0e8db17
--- /dev/null
+++ b/tools/library/src/symm_operation.h
@@ -0,0 +1,379 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines operations for all Symm operation kinds (Symm, Hemm) 
+    in CUTLASS Library.
+
+  
+*/
+
+#pragma once
+#include <iostream>
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/symm.h"
+#include "cutlass/gemm/kernel/default_symm_universal.h"
+
+#include "cutlass/library/library.h"
+#include "library_internal.h"
+#include "cutlass/core_io.h"
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class SymmOperationBase : public Operation {
+public:
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static SideMode const kSideModeA = Operator::kSideModeA;
+  static FillMode const kFillModeA = Operator::kFillModeA;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+protected:
+
+  /// 
+  SymmDescription description_;
+
+public:
+
+  /// Constructor
+  SymmOperationBase(char const *name = "unknown_symm") {
+
+    description_.name = name;
+    description_.provider = Provider::kCUTLASS;
+    description_.symm_kind = SymmKind::kUniversal;
+    description_.side_mode = kSideModeA;    
+    description_.fill_mode = kFillModeA;    
+    description_.blas_mode = kBlasMode;
+
+    description_.kind = OperationKind::kSymm;
+
+    description_.tile_description.threadblock_shape = make_Coord(
+      Operator::ThreadblockShape::kM,
+      Operator::ThreadblockShape::kN,
+      Operator::ThreadblockShape::kK);
+
+    description_.tile_description.threadblock_stages = Operator::kStages;
+
+    description_.tile_description.warp_count = make_Coord(
+      Operator::SymmKernel::WarpCount::kM,
+      Operator::SymmKernel::WarpCount::kN,
+      Operator::SymmKernel::WarpCount::kK);
+    
+    description_.tile_description.math_instruction.instruction_shape = make_Coord(
+      Operator::InstructionShape::kM,
+      Operator::InstructionShape::kN,
+      Operator::InstructionShape::kK);
+
+    description_.tile_description.math_instruction.element_accumulator = 
+      NumericTypeMap<ElementAccumulator>::kId;
+
+    description_.tile_description.math_instruction.opcode_class = 
+      OpcodeClassMap<typename Operator::OperatorClass>::kId;
+
+    description_.tile_description.math_instruction.math_operation =
+      MathOperationMap<typename Operator::Operator>::kId;
+
+    description_.tile_description.minimum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMin;
+
+    description_.tile_description.maximum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMax;
+    
+    description_.A = make_TensorDescription<ElementA, LayoutA>(Operator::kAlignmentA);
+    description_.B = make_TensorDescription<ElementB, LayoutB>(Operator::kAlignmentB);
+    description_.C = make_TensorDescription<ElementC, LayoutC>(Operator::kAlignmentC);
+    description_.element_epilogue = NumericTypeMap<ElementCompute>::kId;
+
+    description_.split_k_mode = SplitKMode::kNone;
+  }
+  
+  /// Returns the description of the SYMM operation
+  virtual OperationDescription const & description() const {
+    return description_;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class SymmOperation : public SymmOperationBase<Operator_> {
+public:
+
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  static BlasMode const kBlasMode = Operator::kBlasMode;
+  static SideMode const kSideModeA = Operator::kSideModeA;
+  static FillMode const kFillModeA = Operator::kFillModeA;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+public:
+
+  /// Constructor
+  SymmOperation(char const *name = "unknown_symm"): 
+    SymmOperationBase<Operator_>(name) {
+
+    this->description_.symm_kind = SymmKind::kUniversal;
+  }
+
+protected:
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status construct_arguments_(
+    OperatorArguments &operator_args,
+    SymmConfiguration const *configuration) {
+
+    //operator_args.mode = configuration->mode;
+
+    operator_args.problem_size = configuration->problem_size;
+    operator_args.batch_count = configuration->batch_count;
+
+    operator_args.lda = int(configuration->lda);
+    operator_args.ldb = int(configuration->ldb);
+    operator_args.ldc = int(configuration->ldc);
+    operator_args.ldd = int(configuration->ldd);
+    
+    return Status::kSuccess;
+  }
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status update_arguments_(
+    OperatorArguments &operator_args,
+    SymmArguments const *arguments) {
+    
+    if (arguments->pointer_mode == ScalarPointerMode::kHost) {
+      typename Operator::EpilogueOutputOp::Params params(
+        *static_cast<ElementCompute const *>(arguments->alpha),
+        *static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params;
+    }
+    else if (arguments->pointer_mode == ScalarPointerMode::kDevice){
+      typename Operator::EpilogueOutputOp::Params params(
+        static_cast<ElementCompute const *>(arguments->alpha),
+        static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params; 
+    }
+    else {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // update arguments
+    operator_args.ptr_A = arguments->A;
+    operator_args.ptr_B = arguments->B;
+    operator_args.ptr_C = arguments->C;
+    operator_args.ptr_D = arguments->D;
+
+    operator_args.batch_stride_A = arguments->batch_stride_A;
+    operator_args.batch_stride_B = arguments->batch_stride_B;
+    operator_args.batch_stride_C = arguments->batch_stride_C;
+    operator_args.batch_stride_D = arguments->batch_stride_D;
+    
+    return Status::kSuccess;
+  }
+
+public:
+
+  /// Returns success if the operation can proceed
+  virtual Status can_implement(
+    void const *configuration_ptr, 
+    void const *arguments_ptr) const {
+    
+    SymmConfiguration const *configuration = 
+      static_cast<SymmConfiguration const *>(configuration_ptr);
+
+    SymmArguments const *arguments = 
+      static_cast<SymmArguments const *>(arguments_ptr);
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(args, configuration);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = update_arguments_(args, arguments);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Operator::can_implement(args);
+  }
+  
+  /// Gets the host-side workspace
+  virtual uint64_t get_host_workspace_size(
+    void const *configuration) const {
+
+    return sizeof(Operator);
+  }
+  
+  /// Gets the device-side workspace
+  virtual uint64_t get_device_workspace_size(
+    void const *configuration_ptr,
+    void const *arguments_ptr = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<SymmConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    uint64_t size = Operator::get_workspace_size(args);
+
+    return size;
+  }
+  
+  /// Initializes the workspace
+  virtual Status initialize(
+    void const *configuration_ptr, 
+    void *host_workspace, 
+    void *device_workspace, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<SymmConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = new (host_workspace) Operator;
+    
+    //std::cout << "initialize() library::SymmOperation" << std::endl;
+    //print_operator_args(args);
+    status = op->initialize(args, device_workspace, stream);
+    
+    return status;
+  }
+
+  /// Runs the kernel
+  virtual Status run(
+    void const *arguments_ptr,
+    void *host_workspace, 
+    void *device_workspace = nullptr, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+    
+    Status status = update_arguments_(
+      args, 
+      static_cast<SymmArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    Operator *op = static_cast<Operator *>(host_workspace);
+
+    bool need_swapped_matrices = (kSideModeA == SideMode::kLeft && 
+                                    std::is_same<typename Operator::LayoutC, layout::ColumnMajor>::value) ||
+                                 (kSideModeA == SideMode::kRight &&
+                                    std::is_same<typename Operator::LayoutC, layout::RowMajor>::value);
+    if (need_swapped_matrices) {
+      status = op->update(args.swapped_matrices(), device_workspace);
+    } else {
+      status = op->update(args, device_workspace);
+    } 
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    //std::cout << "run() library::SymmOperation" << std::endl;
+    //print_operator_args(args);
+    status = op->run(stream);
+    
+    return status;
+  }
+
+  /// Call print_operator_args  from the Conv2dOperation::initialize()
+  // to dump arguments passed on to cutlass operator for debugging
+  void print_operator_args(OperatorArguments &operator_args) const {
+    std::cout << "SymmOperation::OperatorArguments" << std::endl
+              << "  problem_size:" << std::endl 
+              << operator_args.problem_size << std::endl
+              << "  epilouge (alpha, beta): "
+              << operator_args.epilogue.alpha << ", " 
+              << operator_args.epilogue.beta << std::endl
+              << "  ref_A (ptr, {stride}): " 
+              << operator_args.ptr_A << ", {"
+              << operator_args.lda << "}" << std::endl
+              << "  ref_B (ptr, {stride}): " 
+              << operator_args.ptr_B << ", {"
+              << operator_args.ldb << "}" << std::endl
+              << "  ref_C (ptr, {stride}): "
+              << operator_args.ptr_C << ", {"
+              << operator_args.ldc << "}" << std::endl
+              << "  ref_D (ptr, {stride}): "
+              << operator_args.ptr_D << ", {"
+              << operator_args.ldd << "}" << std::endl;
+  } 
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/library/src/trmm_operation.h b/tools/library/src/trmm_operation.h
new file mode 100644
index 00000000..8595bb57
--- /dev/null
+++ b/tools/library/src/trmm_operation.h
@@ -0,0 +1,346 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines operations for all TRMM operation kinds in CUTLASS Library.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/gemm/device/trmm.h"
+#include "cutlass/gemm/kernel/default_trmm_universal.h"
+#include "cutlass/gemm/kernel/trmm_universal.h"
+
+#include "cutlass/library/library.h"
+#include "library_internal.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace library {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class TrmmOperationBase : public Operation {
+public:
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  static SideMode const kSideMode = Operator::kSideMode;
+  static FillMode const kFillMode = Operator::kFillMode;
+  static DiagType const kDiagType = Operator::kDiagType;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+protected:
+
+  /// 
+  TrmmDescription description_;
+
+public:
+
+  /// Constructor
+  TrmmOperationBase(char const *name = "unknown_trmm") {
+
+    description_.name = name;
+    description_.provider = Provider::kCUTLASS;
+    description_.kind = OperationKind::kTrmm;
+    description_.trmm_kind = TrmmKind::kUniversal;
+    description_.side_mode = kSideMode;    
+    description_.fill_mode = kFillMode;    
+    description_.diag_type = kDiagType;    
+
+    description_.tile_description.threadblock_shape = make_Coord(
+      Operator::ThreadblockShape::kM,
+      Operator::ThreadblockShape::kN,
+      Operator::ThreadblockShape::kK);
+
+    description_.tile_description.threadblock_stages = Operator::kStages;
+
+    description_.tile_description.warp_count = make_Coord(
+      Operator::TrmmKernel::WarpCount::kM,
+      Operator::TrmmKernel::WarpCount::kN,
+      Operator::TrmmKernel::WarpCount::kK);
+    
+    description_.tile_description.math_instruction.instruction_shape = make_Coord(
+      Operator::InstructionShape::kM,
+      Operator::InstructionShape::kN,
+      Operator::InstructionShape::kK);
+
+    description_.tile_description.math_instruction.element_accumulator = 
+      NumericTypeMap<ElementAccumulator>::kId;
+
+    description_.tile_description.math_instruction.opcode_class = 
+      OpcodeClassMap<typename Operator::OperatorClass>::kId;
+
+    description_.tile_description.math_instruction.math_operation =
+      MathOperationMap<typename Operator::Operator>::kId;
+
+    description_.tile_description.minimum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMin;
+
+    description_.tile_description.maximum_compute_capability = 
+      ArchMap<typename Operator::ArchTag, typename Operator::OperatorClass>::kMax;
+    
+    description_.A = make_TensorDescription<ElementA, LayoutA>(Operator::kAlignmentA);
+    description_.B = make_TensorDescription<ElementB, LayoutB>(Operator::kAlignmentB);
+    description_.D = make_TensorDescription<ElementC, LayoutC>(Operator::kAlignmentC);
+    description_.element_epilogue = NumericTypeMap<ElementCompute>::kId;
+
+    description_.split_k_mode = SplitKMode::kNone;
+    description_.transform_A = ComplexTransformMap<Operator::kTransformA>::kId;
+  }
+  
+  /// Returns the description of the TRMM operation
+  virtual OperationDescription const & description() const {
+    return description_;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Operator_>
+class TrmmOperation : public TrmmOperationBase<Operator_> {
+public:
+
+  using Operator = Operator_;
+  using ElementA = typename Operator::ElementA;
+  using LayoutA = typename Operator::LayoutA;
+  static SideMode const kSideMode = Operator::kSideMode;
+  static FillMode const kFillMode = Operator::kFillMode;
+  static DiagType const kDiagType = Operator::kDiagType;
+  using ElementB = typename Operator::ElementB;
+  using LayoutB = typename Operator::LayoutB;
+  using ElementC = typename Operator::ElementC;
+  using LayoutC = typename Operator::LayoutC;
+  using ElementAccumulator = typename Operator::ElementAccumulator;
+  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
+
+  using OperatorArguments = typename Operator::Arguments;
+
+public:
+
+  /// Constructor
+  TrmmOperation(char const *name = "unknown_trmm"): 
+    TrmmOperationBase<Operator_>(name) {
+
+    this->description_.trmm_kind = TrmmKind::kUniversal;
+  }
+
+protected:
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status construct_arguments_(
+    OperatorArguments &operator_args,
+    TrmmConfiguration const *configuration) {
+
+    //operator_args.mode = configuration->mode;
+
+    operator_args.problem_size = configuration->problem_size;
+    operator_args.batch_count = configuration->batch_count;
+
+    operator_args.lda = int(configuration->lda);
+    operator_args.ldb = int(configuration->ldb);
+    operator_args.ldd = int(configuration->ldd);
+    
+    return Status::kSuccess;
+  }
+
+  /// Constructs the arguments structure given the configuration and arguments
+  static Status update_arguments_(
+    OperatorArguments &operator_args,
+    TrmmArguments const *arguments) {
+    
+    if (arguments->pointer_mode == ScalarPointerMode::kHost) {
+      typename Operator::EpilogueOutputOp::Params params(
+        *static_cast<ElementCompute const *>(arguments->alpha),
+        *static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params;
+    }
+    else if (arguments->pointer_mode == ScalarPointerMode::kDevice){
+      typename Operator::EpilogueOutputOp::Params params(
+        static_cast<ElementCompute const *>(arguments->alpha),
+        static_cast<ElementCompute const *>(arguments->beta)
+      );
+      operator_args.epilogue = params; 
+    }
+    else {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // update arguments
+    operator_args.ptr_A = arguments->A;
+    operator_args.ptr_B = arguments->B;
+    operator_args.batch_stride_A = arguments->batch_stride_A;
+    operator_args.batch_stride_B = arguments->batch_stride_B;
+    operator_args.ptr_D = arguments->D;
+    operator_args.batch_stride_D = arguments->batch_stride_D;
+
+    return Status::kSuccess;
+  }
+
+public:
+
+  /// Returns success if the operation can proceed
+  virtual Status can_implement(
+    void const *configuration_ptr, 
+    void const *arguments_ptr) const {
+    
+    TrmmConfiguration const *configuration = 
+      static_cast<TrmmConfiguration const *>(configuration_ptr);
+
+    TrmmArguments const *arguments = 
+      static_cast<TrmmArguments const *>(arguments_ptr);
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(args, configuration);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    status = update_arguments_(args, arguments);
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    return Operator::can_implement(args);
+  }
+  
+  /// Gets the host-side workspace
+  virtual uint64_t get_host_workspace_size(
+    void const *configuration) const {
+
+    return sizeof(Operator);
+  }
+  
+  /// Gets the device-side workspace
+  virtual uint64_t get_device_workspace_size(
+    void const *configuration_ptr,
+    void const *arguments_ptr = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<TrmmConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return 0;
+    }
+
+    uint64_t size = Operator::get_workspace_size(args);
+
+    return size;
+  }
+  
+  /// Initializes the workspace
+  virtual Status initialize(
+    void const *configuration_ptr, 
+    void *host_workspace, 
+    void *device_workspace, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+
+    Status status = construct_arguments_(
+      args, 
+      static_cast<TrmmConfiguration const *>(configuration_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    Operator *op = new (host_workspace) Operator;
+
+    status = op->initialize(args, device_workspace, stream);
+    
+    return status;
+  }
+
+  /// Runs the kernel
+  virtual Status run(
+    void const *arguments_ptr,
+    void *host_workspace, 
+    void *device_workspace = nullptr, 
+    cudaStream_t stream = nullptr) const {
+
+    OperatorArguments args;
+    
+    Status status = update_arguments_(
+      args, 
+      static_cast<TrmmArguments const *>(arguments_ptr));
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    Operator *op = static_cast<Operator *>(host_workspace);
+   
+    bool need_swapped_matrices = (kSideMode == SideMode::kLeft && 
+                                    std::is_same<typename Operator::LayoutC, layout::ColumnMajor>::value) ||
+                                 (kSideMode == SideMode::kRight &&
+                                    std::is_same<typename Operator::LayoutC, layout::RowMajor>::value);
+    if (need_swapped_matrices) {
+      status = op->update(args.swapped_matrices(), device_workspace);
+    } else {
+      status = op->update(args, device_workspace);
+    } 
+
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    
+    status = op->run(stream);
+    
+    return status;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace library
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu
index a9cef1c5..4636fd36 100644
--- a/tools/library/src/util.cu
+++ b/tools/library/src/util.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -28,6 +34,8 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/numeric_types.h"
 #include "cutlass/complex.h"
+#include "cutlass/blas3.h"
+
 #include "cutlass/layout/matrix.h"
 
 #include "cutlass/library/library.h"
@@ -97,6 +105,7 @@ GemmKind_enumerants[] = {
   {"universal", "<Universal>", GemmKind::kUniversal},
   {"planar_complex", "<PlanarComplex>", GemmKind::kPlanarComplex},
   {"planar_complex_array", "<PlanarComplexArray>", GemmKind::kPlanarComplexArray},
+  {"grouped", "<Grouped>", GemmKind::kGrouped},
 };
 
 /// Converts a GemmKind enumerant to a string
@@ -118,6 +127,203 @@ char const *to_string(GemmKind type, bool pretty) {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+static struct {
+  char const *text;
+  char const *pretty;
+  RankKKind enumerant;
+}
+RankKKind_enumerants[] = {
+  {"universal", "<Universal>", RankKKind::kUniversal},
+};
+
+/// Converts a SyrkKind enumerant to a string
+char const *to_string(RankKKind type, bool pretty) {
+
+  for (auto const & possible :RankKKind_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  TrmmKind enumerant;
+}
+TrmmKind_enumerants[] = {
+  {"universal", "<Universal>", TrmmKind::kUniversal},
+};
+
+/// Converts a TrmmKind enumerant to a string
+char const *to_string(TrmmKind type, bool pretty) {
+
+  for (auto const & possible :TrmmKind_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  SymmKind enumerant;
+}
+SymmKind_enumerants[] = {
+  {"universal", "<Universal>", SymmKind::kUniversal},
+};
+
+/// Converts a SymmKind enumerant to a string
+char const *to_string(SymmKind type, bool pretty) {
+
+  for (auto const & possible :SymmKind_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  SideMode enumerant;
+}
+SideMode_enumerants[] = {
+  {"left", "Left", SideMode::kLeft},
+  {"right", "Right", SideMode::kRight}
+};
+
+/// Converts a SideMode enumerant to a string
+char const *to_string(SideMode type, bool pretty) {
+
+  for (auto const & possible :SideMode_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  FillMode enumerant;
+}
+FillMode_enumerants[] = {
+  {"lower", "Lower", FillMode::kLower},
+  {"upper", "Upper", FillMode::kUpper}
+};
+
+/// Converts a FillMode enumerant to a string
+char const *to_string(FillMode type, bool pretty) {
+
+  for (auto const & possible :FillMode_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  BlasMode enumerant;
+}
+BlasMode_enumerants[] = {
+  {"symmetric", "Symmetric", BlasMode::kSymmetric},
+  {"hermitian", "Hermitian", BlasMode::kHermitian}
+};
+
+/// Converts a BlasMode enumerant to a string
+char const *to_string(BlasMode type, bool pretty) {
+
+  for (auto const & possible :BlasMode_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  DiagType enumerant;
+}
+DiagType_enumerants[] = {
+  {"nonunit", "NonUnit", DiagType::kNonUnit},
+  {"unit", "Unit", DiagType::kUnit}
+};
+
+/// Converts a DiagType enumerant to a string
+char const *to_string(DiagType type, bool pretty) {
+
+  for (auto const & possible :DiagType_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 static struct {
@@ -128,6 +334,10 @@ static struct {
 OperationKind_enumerants[] = {
   {"eq_gemm", "EqGemm", OperationKind::kEqGemm}, 
   {"gemm", "Gemm", OperationKind::kGemm},               
+  {"rank_k", "RankK", OperationKind::kRankK},
+  {"rank_2k", "Rank2K", OperationKind::kRank2K},
+  {"trmm", "Trmm", OperationKind::kTrmm},
+  {"symm", "Symm", OperationKind::kSymm},
   {"conv2d", "Conv2d", OperationKind::kConv2d},           
   {"conv3d", "Conv3d", OperationKind::kConv3d},           
   {"spgemm", "SparseGemm", OperationKind::kSparseGemm},
@@ -706,6 +916,8 @@ IteratorAlgorithmID_enumerants[] = {
   {"none", "<none>", IteratorAlgorithmID::kNone},
   {"analytic", "<analytic>", IteratorAlgorithmID::kAnalytic},
   {"optimized", "<optimized>", IteratorAlgorithmID::kOptimized},
+  {"fixed_channels", "<fixed_channels>", IteratorAlgorithmID::kFixedChannels},
+  {"few_channels", "<few_channels>", IteratorAlgorithmID::kFewChannels},
 };
 
 /// Converts a ConvModeID enumerant to a string
diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt
index ae922168..c619b157 100644
--- a/tools/profiler/CMakeLists.txt
+++ b/tools/profiler/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #
@@ -38,6 +44,10 @@ set(CUTLASS_TOOLS_PROFILER_SOURCES
   src/problem_space.cpp
   src/operation_profiler.cu
   src/gemm_operation_profiler.cu
+  src/rank_k_operation_profiler.cu
+  src/rank_2k_operation_profiler.cu
+  src/trmm_operation_profiler.cu
+  src/symm_operation_profiler.cu
   src/conv2d_operation_profiler.cu          
   src/conv3d_operation_profiler.cu          
   src/sparse_gemm_operation_profiler.cu
@@ -88,6 +98,11 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM   --operation=Gemm       --provid
 set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d     --providers=cutlass --verification-providers=cudnn,device       --junit-output=test_cutlass_profiler_conv2d)
 set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d     --providers=cutlass --verification-providers=cudnn,device,host  --junit-output=test_cutlass_profiler_conv3d)
 set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM --operation=SparseGemm --providers=cutlass --verification-providers=cublas,device,host --junit-output=test_cutlass_profiler_spgemm)
+set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K   --operation=RankK       --providers=cutlass --verification-providers=cublas        --junit-output=test_cutlass_profiler_rank_k)
+set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K   --operation=Rank2K       --providers=cutlass --verification-providers=cublas        --junit-output=test_cutlass_profiler_rank_2k)
+set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM   --operation=Trmm       --providers=cutlass --verification-providers=device,host        --junit-output=test_cutlass_profiler_trmm)
+set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM   --operation=Symm       --providers=cutlass --verification-providers=cublas,host        --junit-output=test_cutlass_profiler_symm)
+
 cutlass_add_executable_tests(
   test_profiler cutlass_profiler
   DEPENDEES test_all
@@ -96,5 +111,9 @@ cutlass_add_executable_tests(
     CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D
     CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D
     CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM
+    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K
+    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K
+    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM
+    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM
   DISABLE_EXECUTABLE_INSTALL_RULE
   )
diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu
index 4a398886..95ebd09d 100644
--- a/tools/profiler/src/conv2d_operation_profiler.cu
+++ b/tools/profiler/src/conv2d_operation_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h
index 906408e4..39eead34 100644
--- a/tools/profiler/src/conv2d_operation_profiler.h
+++ b/tools/profiler/src/conv2d_operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu
index 4cbff7d3..b386bdcb 100644
--- a/tools/profiler/src/conv3d_operation_profiler.cu
+++ b/tools/profiler/src/conv3d_operation_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h
index c5fee95c..e137b692 100644
--- a/tools/profiler/src/conv3d_operation_profiler.h
+++ b/tools/profiler/src/conv3d_operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/cublas_helpers.cu b/tools/profiler/src/cublas_helpers.cu
index f1e481a8..aceebf6d 100644
--- a/tools/profiler/src/cublas_helpers.cu
+++ b/tools/profiler/src/cublas_helpers.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -170,6 +176,54 @@ bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID ele
   return false;
 }
 
+/// Maps a cutlass::SideMode to cuBLAS side mode
+bool get_cublas_side_mode(cublasSideMode_t& side, SideMode side_mode) {
+
+  switch (side_mode) {
+    case SideMode::kLeft: 
+      side = CUBLAS_SIDE_LEFT;
+      return true;
+    case SideMode::kRight: 
+      side = CUBLAS_SIDE_RIGHT;
+      return true;
+    default: break;
+  }
+
+  return false;
+}
+
+/// Maps a cutlass::FillMode to cuBLAS fill mode
+bool get_cublas_fill_mode(cublasFillMode_t& uplo, FillMode fill_mode) {
+
+  switch (fill_mode) {
+    case FillMode::kLower: 
+      uplo = CUBLAS_FILL_MODE_LOWER;
+      return true;
+    case FillMode::kUpper: 
+      uplo = CUBLAS_FILL_MODE_UPPER;
+      return true;
+    default: break;
+  }
+
+  return false;
+}
+
+/// Maps a cutlass::DiagType to cuBLAS diag type
+bool get_cublas_diag_type(cublasDiagType_t& diag, DiagType diag_type) {
+
+  switch (diag_type) {
+    case DiagType::kNonUnit: 
+      diag = CUBLAS_DIAG_NON_UNIT;
+      return true;
+    case DiagType::kUnit: 
+      diag = CUBLAS_DIAG_UNIT;
+      return true;
+    default: break;
+  }
+
+  return false;
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Gets the cublas algorithm given threadblock tile dimensions and math opcode class
@@ -332,6 +386,773 @@ cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) {
 } // namespace detail
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns a status if cuBLAS can satisfy a particular RankK description
+Status cublas_satisfies(library::RankKDescription const &desc) {
+  auto const &math_instruction = desc.tile_description.math_instruction;
+
+  if (math_instruction.element_accumulator == library::NumericTypeID::kS32 && 
+    math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // output type S4 and S8 not supported in cuBLAS
+  if (desc.C.element == library::NumericTypeID::kS4 || 
+    desc.C.element == library::NumericTypeID::kS8) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // input type BF16 and TF32 not supported in cuBLAS
+  if (desc.A.element == library::NumericTypeID::kBF16 || 
+    desc.A.element == library::NumericTypeID::kTF32) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  return Status::kSuccess;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+cublasRankKDispatcher::cublasRankKDispatcher(
+  library::RankKDescription const &op_desc,
+  library::RankKConfiguration configuration_,
+  library::RankKArguments arguments_
+):
+  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {
+
+  blas_mode = op_desc.blas_mode;
+  num_ranks = op_desc.num_ranks;
+
+  bool good = true;
+
+  good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
+  good = (good && get_cublas_fill_mode(uplo, op_desc.fill_mode));
+  good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
+  good = (good && get_cublas_datatype(data_type_C, op_desc.C.element));
+
+  good = (good && get_cublas_datatype(
+    compute_data_type,
+    op_desc.tile_description.math_instruction.element_accumulator));
+
+  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
+  // internal numerical data types used in the computation.
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  library::OpcodeClassID const & opcode_class =
+    op_desc.tile_description.math_instruction.opcode_class;
+
+  if (good &&
+    op_desc.A.element == library::NumericTypeID::kF32 &&
+    opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
+  }
+  else if (good) {
+    bool const isPedantic = false;
+    switch (compute_data_type) {
+      case CUDA_R_32F:
+      case CUDA_C_32F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
+        break;
+      case CUDA_R_64F:
+      case CUDA_C_64F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
+        break;
+      case CUDA_R_16F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
+        break;
+      case CUDA_R_32I:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
+        break;
+      default:
+        good = false;
+        break;
+    }
+  }
+#endif // __CUDACC_VER_MAJOR__ >= 11
+
+  if (!good) {
+    status = Status::kErrorNotSupported;
+  }
+}
+
+/// Executes RankK using these arguments
+cublasStatus_t cublasRankKDispatcher::operator()(cublasHandle_t handle) {
+ 
+  // SYRK and HERK
+  if (num_ranks == 1) {
+    if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
+      return cublasDsyrk(
+        handle,
+        uplo,
+        trans_A,
+        configuration.problem_size.n(),
+        configuration.problem_size.k(),
+        static_cast<const double*>(arguments.alpha),
+        static_cast<const double*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const double*>(arguments.beta),
+        static_cast<double*>(arguments.D),
+        int(configuration.ldc)
+      );
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
+  
+  #if (__CUDACC_VER_MAJOR__ >= 11)
+      if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+        return CUBLAS_STATUS_NOT_SUPPORTED; 
+  #endif
+  
+      return cublasSsyrk(
+        handle,
+        uplo,
+        trans_A,
+        configuration.problem_size.n(),
+        configuration.problem_size.k(),
+        static_cast<const float*>(arguments.alpha),
+        static_cast<const float*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const float*>(arguments.beta),
+        static_cast<float*>(arguments.D),
+        int(configuration.ldc)
+      );
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
+      
+        if (blas_mode == BlasMode::kHermitian) {
+          return cublasZherk(
+            handle,
+            uplo,
+            trans_A,
+            configuration.problem_size.n(),
+            configuration.problem_size.k(),
+            static_cast<const double*>(arguments.alpha),
+            static_cast<const cuDoubleComplex*>(arguments.A),
+            int(configuration.lda),
+            static_cast<const double*>(arguments.beta),
+            static_cast<cuDoubleComplex*>(arguments.D),
+            int(configuration.ldc)
+          );
+        }    
+        else {
+          return cublasZsyrk(
+            handle,
+            uplo,
+            trans_A,
+            configuration.problem_size.n(),
+            configuration.problem_size.k(),
+            static_cast<const cuDoubleComplex*>(arguments.alpha),
+            static_cast<const cuDoubleComplex*>(arguments.A),
+            int(configuration.lda),
+            static_cast<const cuDoubleComplex*>(arguments.beta),
+            static_cast<cuDoubleComplex*>(arguments.D),
+            int(configuration.ldc)
+          );
+        }
+  
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
+  
+  #if (__CUDACC_VER_MAJOR__ >= 11)
+      if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+        return CUBLAS_STATUS_NOT_SUPPORTED; 
+  #endif
+  
+      if (blas_mode == BlasMode::kHermitian) {
+        return cublasCherk(
+          handle,
+          uplo,
+          trans_A,
+          configuration.problem_size.n(),
+          configuration.problem_size.k(),
+          static_cast<const float*>(arguments.alpha),
+          static_cast<const cuComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const float*>(arguments.beta),
+          static_cast<cuComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }
+      else {
+        return cublasCsyrk(
+          handle,
+          uplo,
+          trans_A,
+          configuration.problem_size.n(),
+          configuration.problem_size.k(),
+          static_cast<const cuComplex*>(arguments.alpha),
+          static_cast<const cuComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const cuComplex*>(arguments.beta),
+          static_cast<cuComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }
+    } else {
+      return CUBLAS_STATUS_NOT_SUPPORTED;
+    }
+  } 
+
+  // SYR2K and HER2K
+  else if (num_ranks == 2) {
+    if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
+      return cublasDsyr2k(
+        handle,
+        uplo,
+        trans_A,
+        configuration.problem_size.n(),
+        configuration.problem_size.k(),
+        static_cast<const double*>(arguments.alpha),
+        static_cast<const double*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const double*>(arguments.B),
+        int(configuration.ldb),
+        static_cast<const double*>(arguments.beta),
+        static_cast<double*>(arguments.D),
+        int(configuration.ldc)
+      );
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
+  
+  #if (__CUDACC_VER_MAJOR__ >= 11)
+      if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+        return CUBLAS_STATUS_NOT_SUPPORTED; 
+  #endif
+  
+      return cublasSsyr2k(
+        handle,
+        uplo,
+        trans_A,
+        configuration.problem_size.n(),
+        configuration.problem_size.k(),
+        static_cast<const float*>(arguments.alpha),
+        static_cast<const float*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const float*>(arguments.B),
+        int(configuration.ldb),
+        static_cast<const float*>(arguments.beta),
+        static_cast<float*>(arguments.D),
+        int(configuration.ldc)
+      );
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
+      
+        if (blas_mode == BlasMode::kHermitian) {
+          return cublasZher2k(
+            handle,
+            uplo,
+            trans_A,
+            configuration.problem_size.n(),
+            configuration.problem_size.k(),
+            static_cast<const cuDoubleComplex*>(arguments.alpha),
+            static_cast<const cuDoubleComplex*>(arguments.A),
+            int(configuration.lda),
+            static_cast<const cuDoubleComplex*>(arguments.B),
+            int(configuration.ldb),
+            static_cast<const double*>(arguments.beta),
+            static_cast<cuDoubleComplex*>(arguments.D),
+            int(configuration.ldc)
+          );
+        }    
+        else {
+          return cublasZsyr2k(
+            handle,
+            uplo,
+            trans_A,
+            configuration.problem_size.n(),
+            configuration.problem_size.k(),
+            static_cast<const cuDoubleComplex*>(arguments.alpha),
+            static_cast<const cuDoubleComplex*>(arguments.A),
+            int(configuration.lda),
+            static_cast<const cuDoubleComplex*>(arguments.B),
+            int(configuration.ldb),
+            static_cast<const cuDoubleComplex*>(arguments.beta),
+            static_cast<cuDoubleComplex*>(arguments.D),
+            int(configuration.ldc)
+          );
+        }
+  
+    } else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
+  
+  #if (__CUDACC_VER_MAJOR__ >= 11)
+      if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+        return CUBLAS_STATUS_NOT_SUPPORTED; 
+  #endif
+  
+      if (blas_mode == BlasMode::kHermitian) {
+        return cublasCher2k(
+          handle,
+          uplo,
+          trans_A,
+          configuration.problem_size.n(),
+          configuration.problem_size.k(),
+          static_cast<const cuComplex*>(arguments.alpha),
+          static_cast<const cuComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const cuComplex*>(arguments.B),
+          int(configuration.ldb),
+          static_cast<const float*>(arguments.beta),
+          static_cast<cuComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }
+      else {
+        return cublasCsyr2k(
+          handle,
+          uplo,
+          trans_A,
+          configuration.problem_size.n(),
+          configuration.problem_size.k(),
+          static_cast<const cuComplex*>(arguments.alpha),
+          static_cast<const cuComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const cuComplex*>(arguments.B),
+          int(configuration.ldb),
+          static_cast<const cuComplex*>(arguments.beta),
+          static_cast<cuComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }
+    } else {
+      return CUBLAS_STATUS_NOT_SUPPORTED;
+    }
+  }
+  else {
+    return CUBLAS_STATUS_NOT_SUPPORTED;
+  }
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns a status if cuBLAS can satisfy a particular TRMM description
+Status cublas_satisfies(library::TrmmDescription const &desc) {
+  auto const &math_instruction = desc.tile_description.math_instruction;
+
+  if (math_instruction.element_accumulator == library::NumericTypeID::kS32 && 
+    math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // output type S4 and S8 not supported in cuBLAS
+  if (desc.D.element == library::NumericTypeID::kS4 || 
+    desc.D.element == library::NumericTypeID::kS8) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // input type BF16 and TF32 not supported in cuBLAS
+  if (desc.A.element == library::NumericTypeID::kBF16 || 
+    desc.A.element == library::NumericTypeID::kTF32) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  return Status::kSuccess;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+cublasTrmmDispatcher::cublasTrmmDispatcher(
+  library::TrmmDescription const &op_desc,
+  library::TrmmConfiguration configuration_,
+  library::TrmmArguments arguments_
+):
+  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {
+
+  bool good = true;
+
+  good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
+  good = (good && get_cublas_side_mode(side, op_desc.side_mode));
+  good = (good && get_cublas_fill_mode(uplo, op_desc.fill_mode));
+  good = (good && get_cublas_diag_type(diag, op_desc.diag_type));
+  good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
+  good = (good && get_cublas_datatype(data_type_B, op_desc.B.element));
+  good = (good && get_cublas_datatype(data_type_D, op_desc.D.element));
+
+  // if A is Transposed, then for cuBLAS that is inverted Fill Mode. 
+  if (trans_A == CUBLAS_OP_T || trans_A == CUBLAS_OP_C) {
+    if (uplo == CUBLAS_FILL_MODE_LOWER)
+      uplo = CUBLAS_FILL_MODE_UPPER;
+    else
+      uplo = CUBLAS_FILL_MODE_LOWER;
+  }
+
+  good = (good && get_cublas_datatype(
+    compute_data_type,
+    op_desc.tile_description.math_instruction.element_accumulator));
+
+  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
+  // internal numerical data types used in the computation.
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  library::OpcodeClassID const & opcode_class =
+    op_desc.tile_description.math_instruction.opcode_class;
+
+  if (good &&
+    op_desc.A.element == library::NumericTypeID::kF32 &&
+    opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
+  }
+  else if (good) {
+    bool const isPedantic = false;
+    switch (compute_data_type) {
+      case CUDA_R_32F:
+      case CUDA_C_32F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
+        break;
+      case CUDA_R_64F:
+      case CUDA_C_64F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
+        break;
+      case CUDA_R_16F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
+        break;
+      case CUDA_R_32I:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
+        break;
+      default:
+        good = false;
+        break;
+    }
+  }
+#endif // __CUDACC_VER_MAJOR__ >= 11
+
+  if (!good) {
+    status = Status::kErrorNotSupported;
+  }
+}
+
+/// Executes TRMM using these arguments
+cublasStatus_t cublasTrmmDispatcher::operator()(cublasHandle_t handle) {
+ 
+  if (data_type_A == data_type_D && data_type_A == CUDA_R_64F) {
+    return cublasDtrmm(
+      handle,
+      side,
+      uplo,
+      trans_A,
+      diag,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const double*>(arguments.alpha),
+      static_cast<const double*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const double*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<double*>(arguments.D),
+      int(configuration.ldd)
+    );
+  } else if (data_type_A == data_type_D && data_type_A == CUDA_R_32F) {
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+    if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+      return CUBLAS_STATUS_NOT_SUPPORTED; 
+#endif
+
+    return cublasStrmm(
+      handle,
+      side,
+      uplo,
+      trans_A,
+      diag,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const float*>(arguments.alpha),
+      static_cast<const float*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const float*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<float*>(arguments.D),
+      int(configuration.ldd)
+    );
+  } else if (data_type_A == data_type_D && data_type_A == CUDA_C_64F) {
+    return cublasZtrmm(
+      handle,
+      side,
+      uplo,
+      trans_A,
+      diag,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const cuDoubleComplex*>(arguments.alpha),
+      static_cast<const cuDoubleComplex*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const cuDoubleComplex*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<cuDoubleComplex*>(arguments.D),
+      int(configuration.ldd)
+    );
+  } else if (data_type_A == data_type_D && data_type_A == CUDA_C_32F) {
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+    if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+      return CUBLAS_STATUS_NOT_SUPPORTED; 
+#endif
+
+    return cublasCtrmm(
+      handle,
+      side,
+      uplo,
+      trans_A,
+      diag,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const cuComplex*>(arguments.alpha),
+      static_cast<const cuComplex*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const cuComplex*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<cuComplex*>(arguments.D),
+      int(configuration.ldd)
+    );
+  } else {
+    return CUBLAS_STATUS_NOT_SUPPORTED;
+  }
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Returns a status if cuBLAS can satisfy a particular Symm description
+Status cublas_satisfies(library::SymmDescription const &desc) {
+  auto const &math_instruction = desc.tile_description.math_instruction;
+
+  if (math_instruction.element_accumulator == library::NumericTypeID::kS32 && 
+    math_instruction.opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // output type S4 and S8 not supported in cuBLAS
+  if (desc.C.element == library::NumericTypeID::kS4 || 
+    desc.C.element == library::NumericTypeID::kS8) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // input type BF16 and TF32 not supported in cuBLAS
+  if (desc.A.element == library::NumericTypeID::kBF16 || 
+    desc.A.element == library::NumericTypeID::kTF32) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // input type BF16 and TF32 not supported in cuBLAS
+  if (desc.B.element == library::NumericTypeID::kBF16 || 
+    desc.B.element == library::NumericTypeID::kTF32) {
+
+    return Status::kErrorNotSupported;
+  }
+
+  // only column major layout is supported in cuBLAS
+  if (desc.A.layout != library::LayoutTypeID::kColumnMajor || 
+      desc.transform_A != library::ComplexTransform::kNone) {
+  
+    return Status::kErrorNotSupported;
+}
+
+  return Status::kSuccess;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+cublasSymmDispatcher::cublasSymmDispatcher(
+  library::SymmDescription const &op_desc,
+  library::SymmConfiguration configuration_,
+  library::SymmArguments arguments_
+):
+  configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {
+
+  blas_mode = op_desc.blas_mode;
+
+  bool good = true;
+
+  good = (good && get_cublas_side_mode(side, op_desc.side_mode));
+  good = (good && get_cublas_fill_mode(uplo, op_desc.fill_mode));
+  good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
+  good = (good && get_cublas_datatype(data_type_C, op_desc.C.element));
+
+  good = (good && get_cublas_datatype(
+    compute_data_type,
+    op_desc.tile_description.math_instruction.element_accumulator));
+
+  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
+  // internal numerical data types used in the computation.
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  library::OpcodeClassID const & opcode_class =
+    op_desc.tile_description.math_instruction.opcode_class;
+
+  if (good &&
+    op_desc.A.element == library::NumericTypeID::kF32 &&
+    opcode_class == library::OpcodeClassID::kTensorOp) {
+
+    compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
+  }
+  else if (good) {
+    bool const isPedantic = false;
+    switch (compute_data_type) {
+      case CUDA_R_32F:
+      case CUDA_C_32F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
+        break;
+      case CUDA_R_64F:
+      case CUDA_C_64F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
+        break;
+      case CUDA_R_16F:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
+        break;
+      case CUDA_R_32I:
+        compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
+        break;
+      default:
+        good = false;
+        break;
+    }
+  }
+#endif // __CUDACC_VER_MAJOR__ >= 11
+
+  if (!good) {
+    status = Status::kErrorNotSupported;
+  }
+}
+
+/// Executes Symm using these arguments
+cublasStatus_t cublasSymmDispatcher::operator()(cublasHandle_t handle) {
+ 
+  // SYMM and HEMM
+  if (data_type_A == data_type_C && data_type_A == CUDA_R_64F) {
+    return cublasDsymm(
+      handle,
+      side,
+      uplo,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const double*>(arguments.alpha),
+      static_cast<const double*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const double*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<const double*>(arguments.beta),
+      static_cast<double*>(arguments.D),
+      int(configuration.ldc)
+    );
+  } else if (data_type_A == data_type_C && data_type_A == CUDA_R_32F) {
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+    if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+      return CUBLAS_STATUS_NOT_SUPPORTED; 
+#endif
+
+    return cublasSsymm(
+      handle,
+      side,
+      uplo,
+      configuration.problem_size.m(),
+      configuration.problem_size.n(),
+      static_cast<const float*>(arguments.alpha),
+      static_cast<const float*>(arguments.A),
+      int(configuration.lda),
+      static_cast<const float*>(arguments.B),
+      int(configuration.ldb),
+      static_cast<const float*>(arguments.beta),
+      static_cast<float*>(arguments.D),
+      int(configuration.ldc)
+    );
+  } else if (data_type_A == data_type_C && data_type_A == CUDA_C_64F) {
+    
+      if (blas_mode == BlasMode::kHermitian) {
+        return cublasZhemm(
+          handle,
+          side,
+          uplo,
+          configuration.problem_size.m(),
+          configuration.problem_size.n(),
+          static_cast<const cuDoubleComplex*>(arguments.alpha),
+          static_cast<const cuDoubleComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const cuDoubleComplex*>(arguments.B),
+          int(configuration.ldb),
+          static_cast<const cuDoubleComplex*>(arguments.beta),
+          static_cast<cuDoubleComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }    
+      else {
+        return cublasZsymm(
+          handle,
+          side,
+          uplo,
+          configuration.problem_size.m(),
+          configuration.problem_size.n(),
+          static_cast<const cuDoubleComplex*>(arguments.alpha),
+          static_cast<const cuDoubleComplex*>(arguments.A),
+          int(configuration.lda),
+          static_cast<const cuDoubleComplex*>(arguments.B),
+          int(configuration.ldb),
+          static_cast<const cuDoubleComplex*>(arguments.beta),
+          static_cast<cuDoubleComplex*>(arguments.D),
+          int(configuration.ldc)
+        );
+      }
+
+  } else if (data_type_A == data_type_C && data_type_A == CUDA_C_32F) {
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+    if (cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH) != CUBLAS_STATUS_SUCCESS)
+      return CUBLAS_STATUS_NOT_SUPPORTED; 
+#endif
+
+    if (blas_mode == BlasMode::kHermitian) {
+      return cublasChemm(
+        handle,
+        side,
+        uplo,
+        configuration.problem_size.m(),
+        configuration.problem_size.n(),
+        static_cast<const cuComplex*>(arguments.alpha),
+        static_cast<const cuComplex*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const cuComplex*>(arguments.B),
+        int(configuration.ldb),
+        static_cast<const cuComplex*>(arguments.beta),
+        static_cast<cuComplex*>(arguments.D),
+        int(configuration.ldc)
+      );
+    }
+    else {
+      return cublasCsymm(
+        handle,
+        side,
+        uplo,
+        configuration.problem_size.m(),
+        configuration.problem_size.n(),
+        static_cast<const cuComplex*>(arguments.alpha),
+        static_cast<const cuComplex*>(arguments.A),
+        int(configuration.lda),
+        static_cast<const cuComplex*>(arguments.B),
+        int(configuration.ldb),
+        static_cast<const cuComplex*>(arguments.beta),
+        static_cast<cuComplex*>(arguments.D),
+        int(configuration.ldc)
+      );
+    }
+  } else {
+    return CUBLAS_STATUS_NOT_SUPPORTED;
+  }
+}
+
+} // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 } // namespace profiler
 } // namespace cutlass
 
diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h
index a899f9ed..90bf8924 100644
--- a/tools/profiler/src/cublas_helpers.h
+++ b/tools/profiler/src/cublas_helpers.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -34,6 +40,8 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/library/library.h"
 #include "cutlass/library/util.h"
+#include "cutlass/blas3.h"
+
 #include "options.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -68,6 +76,15 @@ cublasGemmAlgo_t get_cublas_gemm_algo(
 /// Returns a status if cuBLAS can satisfy a particular GEMM description
 Status cublas_satisfies(library::GemmDescription const &desc);
 
+/// Returns a status if cuBLAS can satisfy a particular RankK description
+Status cublas_satisfies(library::RankKDescription const &desc);
+
+/// Returns a status if cuBLAS can satisfy a particular TRMM description
+Status cublas_satisfies(library::TrmmDescription const &desc);
+
+/// Returns a status if cuBLAS can satisfy a particular SYMM/HEMM description
+Status cublas_satisfies(library::SymmDescription const &desc);
+
 /// This is a helper class to create cublasHandle_t automatically on CublasCreate object creation and 
 /// to destroy cublasHandle_t on CublasCreate object destruction. 
 /// Additionaly, it provides implicit cast from CublasCreate's object to cublasHandle_t's object
@@ -211,6 +228,127 @@ struct cublasGemmExDispatcher {
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Dispatcher to cublas rank k update kernels 
+struct cublasRankKDispatcher {
+
+  //
+  // Data members
+  //
+  library::RankKConfiguration configuration;
+  library::RankKArguments arguments;
+
+  // cublass-specific data structures to fill cublas API call arguments
+  cublasOperation_t trans_A;
+  cublasFillMode_t uplo;
+  cudaDataType_t data_type_A;
+  cudaDataType_t data_type_C;
+  cudaDataType_t compute_data_type;
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  cublasComputeType_t compute_type;
+#endif
+
+  int num_ranks;       //(rank-k or rank-2k)
+  BlasMode blas_mode; //(symmetric or hermitian)
+  Status status;
+  
+  //
+  // Methods
+  //
+
+  cublasRankKDispatcher( 
+    library::RankKDescription const &op_desc,
+    library::RankKConfiguration configuration_,
+    library::RankKArguments arguments_
+  );
+
+  /// Executes RankK using these arguments
+  cublasStatus_t operator()(cublasHandle_t handle);
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dispatcher to cublasTrmm() 
+struct cublasTrmmDispatcher {
+
+  //
+  // Data members
+  //
+  library::TrmmConfiguration configuration;
+  library::TrmmArguments arguments;
+
+  // cublass-specific data structures to fill cublas API call arguments
+  cublasOperation_t trans_A;
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cublasDiagType_t diag;
+  cudaDataType_t data_type_A;
+  cudaDataType_t data_type_B;
+  cudaDataType_t data_type_D;
+  cudaDataType_t compute_data_type;
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  cublasComputeType_t compute_type;
+#endif
+
+  Status status;
+  
+  //
+  // Methods
+  //
+
+  cublasTrmmDispatcher( 
+    library::TrmmDescription const &op_desc,
+    library::TrmmConfiguration configuration_,
+    library::TrmmArguments arguments_
+  );
+
+  /// Executes TRMM using these arguments
+  cublasStatus_t operator()(cublasHandle_t handle);
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Dispatcher to cublas symm/hemm update kernels 
+struct cublasSymmDispatcher {
+
+  //
+  // Data members
+  //
+  library::SymmConfiguration configuration;
+  library::SymmArguments arguments;
+
+  // cublass-specific data structures to fill cublas API call arguments
+  cublasSideMode_t side;
+  cublasFillMode_t uplo;
+  cudaDataType_t data_type_A;
+  cudaDataType_t data_type_B;
+  cudaDataType_t data_type_C;
+  cudaDataType_t compute_data_type;
+
+#if (__CUDACC_VER_MAJOR__ >= 11)
+  cublasComputeType_t compute_type;
+#endif
+  
+  BlasMode blas_mode; //(symmetric or hermitian)
+  Status status;
+  
+  //
+  // Methods
+  //
+
+  cublasSymmDispatcher( 
+    library::SymmDescription const &op_desc,
+    library::SymmConfiguration configuration_,
+    library::SymmArguments arguments_
+  );
+
+  /// Executes Symm using these arguments
+  cublasStatus_t operator()(cublasHandle_t handle);
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 } // namespace detail
 
 } // namespace profiler
diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp
index adb6f76b..0ab6c12a 100644
--- a/tools/profiler/src/cudnn_helpers.cpp
+++ b/tools/profiler/src/cudnn_helpers.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h
index 5b21ac05..f18e78a0 100644
--- a/tools/profiler/src/cudnn_helpers.h
+++ b/tools/profiler/src/cudnn_helpers.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu
index 637fef4f..a3893a4d 100644
--- a/tools/profiler/src/cutlass_profiler.cu
+++ b/tools/profiler/src/cutlass_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -32,6 +38,10 @@
 // Profiler includes
 #include "cutlass_profiler.h"
 #include "gemm_operation_profiler.h"
+#include "rank_k_operation_profiler.h"
+#include "rank_2k_operation_profiler.h"
+#include "trmm_operation_profiler.h"
+#include "symm_operation_profiler.h"
 #include "conv2d_operation_profiler.h"          
 #include "conv3d_operation_profiler.h"          
 #include "sparse_gemm_operation_profiler.h"
@@ -56,6 +66,13 @@ CutlassProfiler::CutlassProfiler(
 
   operation_profilers_.emplace_back(new Conv3dOperationProfiler(options));
 
+  operation_profilers_.emplace_back(new RankKOperationProfiler(options));
+
+  operation_profilers_.emplace_back(new Rank2KOperationProfiler(options));
+
+  operation_profilers_.emplace_back(new TrmmOperationProfiler(options));
+
+  operation_profilers_.emplace_back(new SymmOperationProfiler(options));
 }
 
 CutlassProfiler::~CutlassProfiler() {
@@ -174,6 +191,9 @@ void CutlassProfiler::print_usage_(std::ostream &out) {
 
   out << "\n\nFor details about a particular function, specify the function name with --help.\n\nExample:\n\n"
     << "  $ cutlass_profiler --operation=Gemm --help\n\n"
+    << "  $ cutlass_profiler --operation=RankK --help\n\n"
+    << "  $ cutlass_profiler --operation=Trmm --help\n\n"
+    << "  $ cutlass_profiler --operation=Symm --help\n\n"
     << "  $ cutlass_profiler --operation=Conv3d --help\n\n"         
     << "  $ cutlass_profiler --operation=Conv2d --help\n\n"         
     << "  $ cutlass_profiler --operation=SparseGemm --help\n\n"
diff --git a/tools/profiler/src/cutlass_profiler.h b/tools/profiler/src/cutlass_profiler.h
index 57063573..a566357e 100644
--- a/tools/profiler/src/cutlass_profiler.h
+++ b/tools/profiler/src/cutlass_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h
index 828d4218..ae041658 100644
--- a/tools/profiler/src/debug.h
+++ b/tools/profiler/src/debug.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu
index 7f6b2691..69389e5a 100644
--- a/tools/profiler/src/device_allocation.cu
+++ b/tools/profiler/src/device_allocation.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h
index b809884a..7053bbcc 100644
--- a/tools/profiler/src/device_allocation.h
+++ b/tools/profiler/src/device_allocation.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu
index 95737e13..8f831f37 100644
--- a/tools/profiler/src/device_context.cu
+++ b/tools/profiler/src/device_context.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/device_context.h b/tools/profiler/src/device_context.h
index 595332fd..808ab6c6 100644
--- a/tools/profiler/src/device_context.h
+++ b/tools/profiler/src/device_context.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp
index bad888a9..3b8a283e 100644
--- a/tools/profiler/src/enumerated_types.cpp
+++ b/tools/profiler/src/enumerated_types.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/enumerated_types.h b/tools/profiler/src/enumerated_types.h
index 5d9d9703..9e04e9da 100644
--- a/tools/profiler/src/enumerated_types.h
+++ b/tools/profiler/src/enumerated_types.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu
index 298785b2..ff5ba645 100644
--- a/tools/profiler/src/gemm_operation_profiler.cu
+++ b/tools/profiler/src/gemm_operation_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/gemm_operation_profiler.h b/tools/profiler/src/gemm_operation_profiler.h
index 0f3096f7..396aea8d 100644
--- a/tools/profiler/src/gemm_operation_profiler.h
+++ b/tools/profiler/src/gemm_operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp
index ef2892f4..fddc63a1 100644
--- a/tools/profiler/src/gpu_timer.cpp
+++ b/tools/profiler/src/gpu_timer.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h
index 93517198..7baedd01 100644
--- a/tools/profiler/src/gpu_timer.h
+++ b/tools/profiler/src/gpu_timer.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp
index dc8ad31e..8a1f16eb 100644
--- a/tools/profiler/src/main.cpp
+++ b/tools/profiler/src/main.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu
index 84c3aeb1..6c51b024 100644
--- a/tools/profiler/src/operation_profiler.cu
+++ b/tools/profiler/src/operation_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h
index 70a9d0ec..74e5416c 100644
--- a/tools/profiler/src/operation_profiler.h
+++ b/tools/profiler/src/operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu
index 404c165f..a1d98a4a 100644
--- a/tools/profiler/src/options.cu
+++ b/tools/profiler/src/options.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h
index d8b69fc6..8ec1736b 100644
--- a/tools/profiler/src/options.h
+++ b/tools/profiler/src/options.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp
index 0724db50..f3ad92fd 100644
--- a/tools/profiler/src/performance_report.cpp
+++ b/tools/profiler/src/performance_report.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h
index c2c7ec3b..c536fe7b 100644
--- a/tools/profiler/src/performance_report.h
+++ b/tools/profiler/src/performance_report.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu
index 863728ac..2a282b01 100644
--- a/tools/profiler/src/performance_result.cu
+++ b/tools/profiler/src/performance_result.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/performance_result.h b/tools/profiler/src/performance_result.h
index 870bb665..7b611f63 100644
--- a/tools/profiler/src/performance_result.h
+++ b/tools/profiler/src/performance_result.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp
index 9fedb04f..892987d6 100644
--- a/tools/profiler/src/problem_space.cpp
+++ b/tools/profiler/src/problem_space.cpp
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h
index cf6ea5fc..be274fce 100644
--- a/tools/profiler/src/problem_space.h
+++ b/tools/profiler/src/problem_space.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/rank_2k_operation_profiler.cu b/tools/profiler/src/rank_2k_operation_profiler.cu
new file mode 100644
index 00000000..c5023be1
--- /dev/null
+++ b/tools/profiler/src/rank_2k_operation_profiler.cu
@@ -0,0 +1,727 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Execution environment
+
+  
+*/
+
+#include <iostream>
+#include <stdexcept>
+#include <iomanip>
+#include <ios>
+
+#include "cutlass/core_io.h"
+
+#include "cublas_helpers.h"
+#include "rank_2k_operation_profiler.h"
+#include "gpu_timer.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Ctor
+Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options): 
+  OperationProfiler(
+    options,
+    library::OperationKind::kRank2K,
+    {
+      {ArgumentTypeID::kEnumerated, {"rank_k_kind"}, "Variant of RankK (universal)"},
+      {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the RankK problem space"},
+      {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the RankK problem space"},
+      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
+      {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
+      {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
+      {ArgumentTypeID::kEnumerated, {"fill_mode"}, "Fill Mode for RankK kernel (lower or upper)"},
+      {ArgumentTypeID::kEnumerated, {"blas_mode"}, "Blas Mode for RankK kernel (symmetric or hermitian)"},
+      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
+      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
+      {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
+      {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of RankK computed in one batch"},
+    },
+    { library::Provider::kCUBLAS}
+  ) {
+  description_ = "      Rank 2k Update. D = alpha * (A*B^T + B*A^T) + beta * C (symmetric) or D = alpha * (A*B^H+B*A^H) + beta * C (hermitian)";
+}
+
+/// Destructor
+Rank2KOperationProfiler::~Rank2KOperationProfiler() {
+
+}
+
+/// Prints usage statement for the math function
+void Rank2KOperationProfiler::print_usage(std::ostream &out) const {
+  out << "RankK" << "\n\n";
+
+  OperationProfiler::print_usage(out);
+}
+
+/// Prints examples
+void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
+
+  out << "\nExamples:\n\n"
+    << "Profile a particular problem size Syrk kernel:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --blas_mode=symmetric --n=1024 --k=128\n\n"
+    
+    << "Profile a particular problem size Herk kernel:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --blas_mode=hermitian --n=1024 --k=128\n\n"
+
+    << "Schmoo over problem size and beta:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --n=1024:4096:256 --k=128:8192:128 --beta=0,1,2.5\n\n"
+
+    << "Schmoo over accumulator types:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --accumulator-type=f16,f32\n\n"
+
+    << "Schmoo over fill modees:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --fill_mode=lower/upper\n\n"
+
+    << "Run when A is f16 with column-major or A is any datatype with row-major (For column major, use column, col, or n. For row major use, row or t):\n"
+    << "  $ cutlass_profiler --operation=rank_2k --A=f16:column or --A=*:row\n\n"
+
+    << "Using various input value distribution:\n"
+    << "  $ cutlass_profiler --operation=rank_2k --dist=uniform,min:0,max:3\n"
+    << "  $ cutlass_profiler --operation=rank_2k --dist=gaussian,mean:0,stddev:3\n"
+    << "  $ cutlass_profiler --operation=rank_2k --dist=sequential,start:0,delta:1\n\n"
+
+    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
+    << " $ cutlass_profiler --operation=rank_2k --cta_m=256 --cta_n=128  --cta_k=32 --save-workspace=incorrect\n\n"
+    
+    << "Test your changes to rank_2k kernels with a quick functional test and save results in functional-test.csv:\n"
+    << " $ cutlass_profiler  --operation=rank_2k \\ \n"
+    << "   --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
+    << "   --k=8,16,32,64,128,256,288,384,504,512,520 \\ \n"
+    << "   --beta=0,1,2 --profiling-iterations=1 \\ \n"
+    << "   --providers=cutlass --output=functional-test.csv\n\n";
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+// used this for debugging
+static std::string byte_string(std::vector<uint8_t> const &bytes) {
+  std::stringstream ss;
+
+  ss << "0x";
+
+  for (size_t idx = bytes.size(); idx > 0; --idx) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1));
+  }
+
+  return ss.str();
+}
+#endif
+
+Status Rank2KOperationProfiler::RankKProblem::parse(
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  if (!arg_as_int(this->n, "n", problem_space, problem)) {
+    // default value
+    this->n = 1024;
+  }
+  
+  if (!arg_as_int(this->k, "k", problem_space, problem)) {
+    // default value
+    this->k = 1024;
+  }
+  
+  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
+    // default value
+    this->split_k_slices = 1;
+  }
+  
+  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
+    // default value
+    this->batch_count = 1;
+  }
+
+  if (this->split_k_slices > 1 && this->batch_count > 1) {
+    // At least one of these must be one
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.B, "B", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.C, "C", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!arg_as_scalar(
+    this->alpha, 
+    operation_desc.element_epilogue, 
+    "alpha", 
+    problem_space, 
+    problem)) {
+
+    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (!arg_as_scalar(
+    this->beta, 
+    operation_desc.element_epilogue, 
+    "beta", 
+    problem_space, 
+    problem)) {
+    
+    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  this->lda = DeviceAllocation::get_packed_layout(
+    operation_desc.A.layout, {int(this->n), int(this->k)}).front();
+
+  this->ldb = DeviceAllocation::get_packed_layout(
+    operation_desc.B.layout, {int(this->n), int(this->k)}).front();
+
+  this->ldc = DeviceAllocation::get_packed_layout(
+    operation_desc.C.layout, {int(this->n), int(this->n)}).front();
+
+  return Status::kSuccess;
+}
+
+/// Total number of bytes loaded
+int64_t Rank2KOperationProfiler::RankKProblem::bytes(library::RankKDescription const &operation_desc) const {
+  // Input bytes read and Output bytes written for the gemm problem
+  int64_t bytes =
+    2 * int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * k +
+    2 * int64_t(library::sizeof_bits(operation_desc.B.element) * n / 8) * k +
+    // Half matrix including the diagonal will have (N*(N+1))/2 elements
+    int64_t(library::sizeof_bits(operation_desc.C.element) * n / 8) * (n+1) / 2;
+
+  // Set is_beta_zero true if beta is zero
+  bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; });
+
+  // Output bytes read for the gemm problem for non-zero beta values
+  if (!is_beta_zero) {
+    bytes += int64_t(library::sizeof_bits(operation_desc.C.element) * n / 8) * (n+1) / 2;
+  }
+
+  bytes *= batch_count;
+
+  return bytes;
+}
+
+/// Total number of flops computed
+int64_t Rank2KOperationProfiler::RankKProblem::flops(library::RankKDescription const &operation_desc) const {
+
+  // FLOPs = 2 * n(n+1)k/2 [mma1] + 2 * n(n+1)k/2 [mma2] + 2 * n(n+1)/2 [epilogue]
+  // FLOPs = n(n+1)(2k + 1)
+  int64_t flops_ = n * (n + 1) * (2*k + 1);
+
+  // complex-valued support
+  switch (operation_desc.tile_description.math_instruction.math_operation) {
+  case library::MathOperationID::kMultiplyAddComplex:
+    flops_ *= 4;
+    break;
+
+  case library::MathOperationID::kMultiplyAddComplexFastF32:
+    flops_ *= 4;
+    break;
+
+  case library::MathOperationID::kMultiplyAddGaussianComplex:
+    flops_ *= 3;
+    break;
+
+  default: break;
+  }
+
+  return flops_;
+}
+
+/// Initializes a performance result
+void Rank2KOperationProfiler::RankKProblem::initialize_result(
+  PerformanceResult &result,
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.arguments.resize(problem_space.rank());
+
+  set_argument(result, "rank_k_kind", problem_space, library::to_string(operation_desc.rank_k_kind));
+
+  set_argument(result, "A", problem_space,
+    std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout));
+
+  set_argument(result, "B", problem_space,
+    std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout));
+
+  set_argument(result, "C", problem_space,
+    std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
+
+  set_argument(result, "fill_mode", problem_space, library::to_string(operation_desc.fill_mode));
+
+  set_argument(result, "blas_mode", problem_space, library::to_string(operation_desc.blas_mode));
+
+  set_argument(result, "n", problem_space, n);
+  set_argument(result, "k", problem_space, k);
+
+  set_argument(result, "split_k_slices", problem_space, split_k_slices);
+  set_argument(result, "batch_count", problem_space, batch_count);
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(beta, operation_desc.element_epilogue));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Extracts the problem dimensions
+Status Rank2KOperationProfiler::initialize_configuration(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::RankKDescription const &operation_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  Status status = problem_.parse(operation_desc, problem_space, problem);
+  
+  if (status != Status::kSuccess) {
+    return status;
+  }
+
+  rank_k_workspace_.configuration.problem_size.m() = int(problem_.n);
+  rank_k_workspace_.configuration.problem_size.n() = int(problem_.n);
+  rank_k_workspace_.configuration.problem_size.k() = int(problem_.k);
+  rank_k_workspace_.configuration.lda = problem_.lda;
+  rank_k_workspace_.configuration.ldb = problem_.ldb;
+  rank_k_workspace_.configuration.ldc = problem_.ldc;
+  rank_k_workspace_.configuration.ldd = problem_.ldc;
+  //rank_k_workspace_.configuration.split_k_slices = int(problem_.split_k_slices);
+  rank_k_workspace_.configuration.batch_count = int(problem_.split_k_slices);
+
+  rank_k_workspace_.arguments.A = nullptr;
+  rank_k_workspace_.arguments.B = nullptr;
+  rank_k_workspace_.arguments.C = nullptr;
+  rank_k_workspace_.arguments.D = nullptr;
+  rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+  rank_k_workspace_.arguments.beta = problem_.beta.data();
+  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  initialize_result_(this->model_result_, options, operation_desc, problem_space);
+  
+  return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
+}
+
+/// Initializes the performance result
+void Rank2KOperationProfiler::initialize_result_(
+  PerformanceResult &result,
+  Options const &options,  
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.provider = library::Provider::kCUTLASS;
+  result.disposition = Disposition::kNotRun;
+  result.status = Status::kSuccess;
+  result.operation_name = operation_desc.name;
+  
+  problem_.initialize_result(result, operation_desc, problem_space);
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+
+  result.bytes = problem_.bytes(operation_desc);
+  result.flops = problem_.flops(operation_desc);
+  result.runtime = 0;
+
+
+}
+
+/// Initializes workspace
+Status Rank2KOperationProfiler::initialize_workspace(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  library::RankKDescription const &operation_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  if (options.execution_mode != ExecutionMode::kDryRun) {
+
+    rank_k_workspace_.A = device_context.allocate_tensor(
+      options,
+      "A",
+      operation_desc.A.element,
+      operation_desc.A.layout,
+      {int(problem_.n), int(problem_.k)},
+      {int(problem_.lda)}
+    );
+
+    rank_k_workspace_.B = device_context.allocate_tensor(
+      options,
+      "B",
+      operation_desc.B.element,
+      operation_desc.B.layout,
+      {int(problem_.n), int(problem_.k)},
+      {int(problem_.ldb)}
+    );
+
+    rank_k_workspace_.C = device_context.allocate_tensor(
+      options,
+      "C",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)},
+      1 // batch_count = 1, default
+    );
+
+    rank_k_workspace_.Computed = device_context.allocate_tensor(
+      "D",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    rank_k_workspace_.Reference = device_context.allocate_tensor(
+      "Reference",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
+    rank_k_workspace_.Reference->copy_from_device(rank_k_workspace_.C->data());
+  }
+
+
+  //
+  // Initialize the CUTLASS operation
+  //
+  Status status = Status::kSuccess;
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    if (options.execution_mode != ExecutionMode::kDryRun) {
+
+      uint64_t workspace_size = operation->get_host_workspace_size(&rank_k_workspace_.configuration);
+      rank_k_workspace_.host_workspace.resize(workspace_size, 0);
+
+      workspace_size = operation->get_device_workspace_size(&rank_k_workspace_.configuration);
+      rank_k_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
+
+      status = operation->initialize(
+        &rank_k_workspace_.configuration,
+        rank_k_workspace_.host_workspace.data(),
+        rank_k_workspace_.device_workspace.data());
+    }
+
+    //
+    // If CUTLASS is enabled, generate a result for it
+    //
+    results_.push_back(model_result_);
+    results_.back().provider = library::Provider::kCUTLASS;
+    results_.back().op_kind = library::OperationKind::kRank2K;
+    results_.back().disposition = Disposition::kNotRun;
+
+    for(auto provider : verification_providers_) {
+      results_.back().verification_map[provider] = Disposition::kNotRun;
+    }
+  }
+
+  return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool Rank2KOperationProfiler::verify_cutlass(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+    return true;
+  }
+
+  if (options.execution_mode == ExecutionMode::kDryRun) {
+    return true;
+  }
+
+  // Initialize structure containing RankK arguments
+  rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+  rank_k_workspace_.arguments.B = rank_k_workspace_.B->data();
+  rank_k_workspace_.arguments.C = rank_k_workspace_.C->data();
+  rank_k_workspace_.arguments.D = rank_k_workspace_.Computed->data();
+  rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+  rank_k_workspace_.arguments.beta = problem_.beta.data();
+  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  //
+  // Run the CUTLASS operation
+  //
+
+  results_.back().status = operation->run(
+    &rank_k_workspace_.arguments, 
+    rank_k_workspace_.host_workspace.data(),
+    rank_k_workspace_.device_workspace.data());
+
+  if (results_.back().status != Status::kSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  cudaError_t result = cudaDeviceSynchronize();
+  if (result != cudaSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  // CUTLASS op ran the but not yet verified against any verification provider
+  results_.back().disposition = Disposition::kNotVerified;
+
+  //
+  // Run verification providers
+  //
+
+  if (options.verification.enabled) {
+
+#if CUTLASS_ENABLE_CUBLAS
+    if (options.verification.provider_enabled(library::Provider::kCUBLAS)) {
+
+      // Guard against unsupported cases
+      auto const & rank_k_desc = static_cast<library::RankKDescription const &>(operation->description());
+
+      if (cublas_satisfies(rank_k_desc) == Status::kSuccess) {
+
+        // call cublas verification if supported
+        verify_with_cublas_(
+          options,
+          report,
+          device_context,
+          operation,
+          problem_space,
+          problem);
+        }
+
+      else {
+        // set verification map for cublas to not supported
+        results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotSupported;
+      }
+    }
+#endif // #if CUTLASS_ENABLE_CUBLAS
+    
+    // Update disposition to worst case verification outcome among all 
+    // verification providers which are supported
+    bool is_any_verification_run_passed = false;
+    for(auto &m : results_.back().verification_map) {
+      if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) {
+        results_.back().disposition = m.second;
+        return true;
+      }
+      if(!is_any_verification_run_passed && m.second == Disposition::kPassed) {
+        is_any_verification_run_passed = true;
+      }
+    }
+
+    if(is_any_verification_run_passed) {
+      results_.back().disposition = Disposition::kPassed;
+    }
+  }
+
+  // Return true means continue profiling
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool Rank2KOperationProfiler::verify_with_cublas_(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+
+#if CUTLASS_ENABLE_CUBLAS
+
+  library::RankKDescription const &rank_k_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  //
+  // Construct cuBLAS operators
+  //
+    
+  CublasCreate handle;
+  cublasStatus_t status = handle.get_cublas_create_status();
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+    return true;
+  }
+
+  //
+  // Initialize state
+  //
+
+  try {
+
+    //
+    // Construct dispatcher to cublas<t>Syr2k()
+    //
+
+    // Initialize structure containing RankK arguments
+    rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+    rank_k_workspace_.arguments.B = rank_k_workspace_.B->data();
+    rank_k_workspace_.arguments.C = rank_k_workspace_.Reference->data();
+    rank_k_workspace_.arguments.D = rank_k_workspace_.Reference->data();
+    rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+    rank_k_workspace_.arguments.beta = problem_.beta.data();
+    rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    detail::cublasRankKDispatcher rank_k_op( 
+      rank_k_desc, 
+      rank_k_workspace_.configuration,
+      rank_k_workspace_.arguments
+    );
+
+    if (rank_k_op.status != Status::kSuccess) {
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
+      return true;
+    }
+
+    results_.back().status = Status::kSuccess;
+
+    status = rank_k_op(handle);
+
+    // Handle errors
+    if (status != CUBLAS_STATUS_SUCCESS) {
+
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+      return true;
+    }
+
+    //
+    // Verify results
+    //
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = compare_tensors(
+      options,
+      *rank_k_workspace_.Computed,
+      *rank_k_workspace_.Reference
+    );
+
+    // Save workspace if incorrect
+    if (options.verification.save_workspace == SaveWorkspace::kIncorrect && 
+      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
+
+      save_workspace(
+        device_context,
+        options,
+        rank_k_desc,
+        library::Provider::kCUTLASS,
+        library::Provider::kCUBLAS);
+    }
+  }
+  catch (...) {
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+  }
+
+#endif
+
+  // Return true means continue profiling
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Measures performance results
+bool Rank2KOperationProfiler::profile(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    // Initialize structure containing RankK arguments
+    rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+    rank_k_workspace_.arguments.B = rank_k_workspace_.B->data();
+    rank_k_workspace_.arguments.C = rank_k_workspace_.C->data();
+    rank_k_workspace_.arguments.D = rank_k_workspace_.Computed->data();
+    rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+    rank_k_workspace_.arguments.beta = problem_.beta.data();
+    rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    results_.back().status = profile_cutlass_(
+      results_.back().runtime,
+      options,
+      operation,
+      &rank_k_workspace_.arguments,
+      rank_k_workspace_.host_workspace.data(),
+      rank_k_workspace_.device_workspace.data()
+    );
+  }
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/rank_2k_operation_profiler.h b/tools/profiler/src/rank_2k_operation_profiler.h
new file mode 100644
index 00000000..e4c62e80
--- /dev/null
+++ b/tools/profiler/src/rank_2k_operation_profiler.h
@@ -0,0 +1,229 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines a math function
+
+  
+*/
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <unordered_map>
+
+// CUTLASS Library includes
+#include "cutlass/blas3.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/util.h"
+#include "cutlass/library/manifest.h"
+
+// Profiler includes
+#include "options.h"
+#include "device_context.h"
+#include "operation_profiler.h"
+#include "performance_result.h"
+#include "problem_space.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/// Abstract base class for each math function
+class Rank2KOperationProfiler : public OperationProfiler {
+public:
+
+  /// Problem structure obtained from problem space
+  struct RankKProblem {
+    int64_t n;
+    int64_t k;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldc;
+    FillMode fill_mode;
+    BlasMode blas_mode;
+    std::vector<uint8_t> alpha;
+    std::vector<uint8_t> beta;
+    int64_t split_k_slices;
+    int64_t batch_count;
+
+    //
+    // Methods
+    //
+
+    RankKProblem(): 
+      n(16), k(16), lda(0), ldc(0), 
+      fill_mode(FillMode::kInvalid), blas_mode(BlasMode::kInvalid), 
+      split_k_slices(1), batch_count(1) { }
+
+    /// Parses the problem
+    Status parse(
+      library::RankKDescription const &operation_desc,
+      ProblemSpace const &problem_space,
+      ProblemSpace::Problem const &problem);
+
+    /// Total number of bytes loaded
+    int64_t bytes(library::RankKDescription const &operation_desc) const;
+
+    /// Total number of flops computed
+    int64_t flops(library::RankKDescription const &operation_desc) const;
+    
+    /// Initializes a performance result
+    void initialize_result(
+      PerformanceResult &result,
+      library::RankKDescription const &operation_desc,
+      ProblemSpace const &problem_space);
+  };
+
+  /// Workspace used 
+  struct RankKWorkspace {
+
+    DeviceAllocation *A;
+    DeviceAllocation *B;
+    DeviceAllocation *C;
+    DeviceAllocation *Computed;
+    DeviceAllocation *Reference;
+
+    library::RankKConfiguration configuration;
+    library::RankKArguments arguments;
+
+    /// Buffer used for the operation's host workspace
+    std::vector<uint8_t> host_workspace;
+
+    /// Buffer used for the operations' device workspace
+    DeviceAllocation device_workspace;
+
+    //
+    // Methods
+    //
+
+    RankKWorkspace(): 
+      A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
+  };
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem obtained from problem space
+  RankKProblem problem_;
+
+  /// Device memory allocations 
+  RankKWorkspace rank_k_workspace_;
+
+
+public:
+  //
+  // Methods
+  //
+
+  /// Ctor
+  Rank2KOperationProfiler(Options const &options);
+
+  /// Destructor
+  virtual ~Rank2KOperationProfiler();
+
+  /// Prints usage statement for the math function
+  virtual void print_usage(std::ostream &out) const;
+
+  /// Prints examples
+  virtual void print_examples(std::ostream &out) const;
+
+  /// Extracts the problem dimensions
+  virtual Status initialize_configuration(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Initializes workspace
+  virtual Status initialize_workspace(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Verifies CUTLASS against references
+  virtual bool verify_cutlass(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Measures performance results
+  virtual bool profile(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+protected:
+
+  /// Initializes the performance result
+  void initialize_result_(
+    PerformanceResult &result,
+    Options const &options,  
+    library::RankKDescription const &operation_desc,
+    ProblemSpace const &problem_space);
+
+  /// Verifies CUTLASS against references
+  bool verify_with_cublas_(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/rank_k_operation_profiler.cu b/tools/profiler/src/rank_k_operation_profiler.cu
new file mode 100644
index 00000000..8546a5b2
--- /dev/null
+++ b/tools/profiler/src/rank_k_operation_profiler.cu
@@ -0,0 +1,715 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Execution environment
+
+  
+*/
+
+#include <iostream>
+#include <stdexcept>
+#include <iomanip>
+#include <ios>
+
+#include "cutlass/core_io.h"
+
+#include "cublas_helpers.h"
+#include "rank_k_operation_profiler.h"
+#include "gpu_timer.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Ctor
+RankKOperationProfiler::RankKOperationProfiler(Options const &options): 
+  OperationProfiler(
+    options,
+    library::OperationKind::kRankK,
+    {
+      {ArgumentTypeID::kEnumerated, {"rank_k_kind"}, "Variant of RankK (universal)"},
+      {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the RankK problem space"},
+      {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the RankK problem space"},
+      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
+      {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
+      {ArgumentTypeID::kEnumerated, {"fill_mode"}, "Fill Mode for RankK kernel (lower or upper)"},
+      {ArgumentTypeID::kEnumerated, {"blas_mode"}, "Blas Mode for RankK kernel (symmetric or hermitian)"},
+      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
+      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
+      {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
+      {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of RankK computed in one batch"},
+    },
+    { library::Provider::kCUBLAS}
+  ) {
+  description_ = "      Rank-k Update. D = alpha * A*A^T + beta * C (symmetric) or D = alpha * A*A^H + beta * C (hermitian)";
+}
+
+/// Destructor
+RankKOperationProfiler::~RankKOperationProfiler() {
+
+}
+
+/// Prints usage statement for the math function
+void RankKOperationProfiler::print_usage(std::ostream &out) const {
+  out << "RankK" << "\n\n";
+
+  OperationProfiler::print_usage(out);
+}
+
+/// Prints examples
+void RankKOperationProfiler::print_examples(std::ostream &out) const {
+
+  out << "\nExamples:\n\n"
+    << "Profile a particular problem size Syrk kernel:\n"
+    << "  $ cutlass_profiler --operation=rank_k --blas_mode=symmetric --n=1024 --k=128\n\n"
+    
+    << "Profile a particular problem size Herk kernel:\n"
+    << "  $ cutlass_profiler --operation=rank_k --blas_mode=hermitian --n=1024 --k=128\n\n"
+
+    << "Schmoo over problem size and beta:\n"
+    << "  $ cutlass_profiler --operation=rank_k --n=1024:4096:256 --k=128:8192:128 --beta=0,1,2.5\n\n"
+
+    << "Schmoo over accumulator types:\n"
+    << "  $ cutlass_profiler --operation=rank_k --accumulator-type=f16,f32\n\n"
+
+    << "Schmoo over fill modees:\n"
+    << "  $ cutlass_profiler --operation=rank_k --fill_mode=lower/upper\n\n"
+
+    << "Run when A is f16 with column-major or A is any datatype with row-major (For column major, use column, col, or n. For row major use, row or t):\n"
+    << "  $ cutlass_profiler --operation=rank_k --A=f16:column or --A=*:row\n\n"
+
+    << "Using various input value distribution:\n"
+    << "  $ cutlass_profiler --operation=rank_k --dist=uniform,min:0,max:3\n"
+    << "  $ cutlass_profiler --operation=rank_k --dist=gaussian,mean:0,stddev:3\n"
+    << "  $ cutlass_profiler --operation=rank_k --dist=sequential,start:0,delta:1\n\n"
+
+    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
+    << " $ cutlass_profiler --operation=rank_k --cta_m=256 --cta_n=128  --cta_k=32 --save-workspace=incorrect\n\n"
+    
+    << "Test your changes to rank_k kernels with a quick functional test and save results in functional-test.csv:\n"
+    << " $ cutlass_profiler  --operation=rank_k \\ \n"
+    << "   --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
+    << "   --k=8,16,32,64,128,256,288,384,504,512,520 \\ \n"
+    << "   --beta=0,1,2 --profiling-iterations=1 \\ \n"
+    << "   --providers=cutlass --output=functional-test.csv\n\n";
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+// used this for debugging
+static std::string byte_string(std::vector<uint8_t> const &bytes) {
+  std::stringstream ss;
+
+  ss << "0x";
+
+  for (size_t idx = bytes.size(); idx > 0; --idx) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1));
+  }
+
+  return ss.str();
+}
+#endif
+
+Status RankKOperationProfiler::RankKProblem::parse(
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  if (!arg_as_int(this->n, "n", problem_space, problem)) {
+    // default value
+    this->n = 1024;
+  }
+  
+  if (!arg_as_int(this->k, "k", problem_space, problem)) {
+    // default value
+    this->k = 1024;
+  }
+  
+  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
+    // default value
+    this->split_k_slices = 1;
+  }
+  
+  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
+    // default value
+    this->batch_count = 1;
+  }
+
+  if (this->split_k_slices > 1 && this->batch_count > 1) {
+    // At least one of these must be one
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.C, "C", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!arg_as_scalar(
+    this->alpha, 
+    operation_desc.element_epilogue, 
+    "alpha", 
+    problem_space, 
+    problem)) {
+
+    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (!arg_as_scalar(
+    this->beta, 
+    operation_desc.element_epilogue, 
+    "beta", 
+    problem_space, 
+    problem)) {
+    
+    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  this->lda = DeviceAllocation::get_packed_layout(
+    operation_desc.A.layout, {int(this->n), int(this->k)}).front();
+
+  this->ldc = DeviceAllocation::get_packed_layout(
+    operation_desc.C.layout, {int(this->n), int(this->n)}).front();
+
+  return Status::kSuccess;
+}
+
+/// Total number of bytes loaded
+int64_t RankKOperationProfiler::RankKProblem::bytes(library::RankKDescription const &operation_desc) const {
+  // Input bytes read and Output bytes written for the gemm problem
+  int64_t bytes =
+    int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * k +
+    int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * k +
+    // Half matrix including the diagonal will have (N*(N+1))/2 elements
+    int64_t(library::sizeof_bits(operation_desc.C.element) * n / 8) * (n+1) / 2;
+
+  // Set is_beta_zero true if beta is zero
+  bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; });
+
+  // Output bytes read for the gemm problem for non-zero beta values
+  if (!is_beta_zero) {
+    bytes += int64_t(library::sizeof_bits(operation_desc.C.element) * n / 8) * (n+1) / 2;
+  }
+
+  bytes *= batch_count;
+
+  return bytes;
+}
+
+/// Total number of flops computed
+int64_t RankKOperationProfiler::RankKProblem::flops(library::RankKDescription const &operation_desc) const {
+
+  // FLOPs = 2 * n(n+1)k/2 [mma] + 2 * n(n+1)/2 [epilogue]
+  // FLOPs = n(n+1)(k + 1)
+  int64_t flops_ = n * (n + 1) * (k + 1);
+
+  // complex-valued support
+  switch (operation_desc.tile_description.math_instruction.math_operation) {
+  case library::MathOperationID::kMultiplyAddComplex:
+    flops_ *= 4;
+    break;
+
+  case library::MathOperationID::kMultiplyAddComplexFastF32:
+    flops_ *= 4;
+    break;
+    
+  case library::MathOperationID::kMultiplyAddGaussianComplex:
+    flops_ *= 3;
+    break;
+
+  default: break;
+  }
+
+  return flops_;
+}
+
+/// Initializes a performance result
+void RankKOperationProfiler::RankKProblem::initialize_result(
+  PerformanceResult &result,
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.arguments.resize(problem_space.rank());
+
+  set_argument(result, "rank_k_kind", problem_space, library::to_string(operation_desc.rank_k_kind));
+
+  set_argument(result, "A", problem_space,
+    std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout));
+
+  set_argument(result, "C", problem_space,
+    std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
+
+  set_argument(result, "fill_mode", problem_space, library::to_string(operation_desc.fill_mode));
+
+  set_argument(result, "blas_mode", problem_space, library::to_string(operation_desc.blas_mode));
+
+  set_argument(result, "n", problem_space, n);
+  set_argument(result, "k", problem_space, k);
+
+  set_argument(result, "split_k_slices", problem_space, split_k_slices);
+  set_argument(result, "batch_count", problem_space, batch_count);
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(beta, operation_desc.element_epilogue));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Extracts the problem dimensions
+Status RankKOperationProfiler::initialize_configuration(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::RankKDescription const &operation_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  Status status = problem_.parse(operation_desc, problem_space, problem);
+  
+  if (status != Status::kSuccess) {
+    return status;
+  }
+
+  rank_k_workspace_.configuration.problem_size.m() = int(problem_.n);
+  rank_k_workspace_.configuration.problem_size.n() = int(problem_.n);
+  rank_k_workspace_.configuration.problem_size.k() = int(problem_.k);
+  rank_k_workspace_.configuration.lda = problem_.lda;
+  rank_k_workspace_.configuration.ldc = problem_.ldc;
+  rank_k_workspace_.configuration.ldd = problem_.ldc;
+  //rank_k_workspace_.configuration.split_k_slices = int(problem_.split_k_slices);
+  rank_k_workspace_.configuration.batch_count = int(problem_.split_k_slices);
+
+  rank_k_workspace_.arguments.A = nullptr;
+  rank_k_workspace_.arguments.C = nullptr;
+  rank_k_workspace_.arguments.D = nullptr;
+  rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+  rank_k_workspace_.arguments.beta = problem_.beta.data();
+  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  initialize_result_(this->model_result_, options, operation_desc, problem_space);
+  
+  return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
+}
+
+/// Initializes the performance result
+void RankKOperationProfiler::initialize_result_(
+  PerformanceResult &result,
+  Options const &options,  
+  library::RankKDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.provider = library::Provider::kCUTLASS;
+  result.disposition = Disposition::kNotRun;
+  result.status = Status::kSuccess;
+  result.operation_name = operation_desc.name;
+  
+  problem_.initialize_result(result, operation_desc, problem_space);
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+
+  result.bytes = problem_.bytes(operation_desc);
+  result.flops = problem_.flops(operation_desc);
+
+  result.runtime = 0;
+
+  // complex-valued support
+  switch (operation_desc.tile_description.math_instruction.math_operation) {
+  case library::MathOperationID::kMultiplyAddComplex:
+    result.flops *= 4;
+    break;
+     
+  case library::MathOperationID::kMultiplyAddComplexFastF32:
+    result.flops *= 4;
+    break;
+
+  default: break;
+  }
+
+}
+
+/// Initializes workspace
+Status RankKOperationProfiler::initialize_workspace(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  library::RankKDescription const &operation_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  if (options.execution_mode != ExecutionMode::kDryRun) {
+
+    rank_k_workspace_.A = device_context.allocate_tensor(
+      options,
+      "A",
+      operation_desc.A.element,
+      operation_desc.A.layout,
+      {int(problem_.n), int(problem_.k)},
+      {int(problem_.lda)}
+    );
+
+    rank_k_workspace_.C = device_context.allocate_tensor(
+      options,
+      "C",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)},
+      1 // batch_count = 1, default
+    );
+
+    rank_k_workspace_.Computed = device_context.allocate_tensor(
+      "D",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    rank_k_workspace_.Reference = device_context.allocate_tensor(
+      "Reference",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.n), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
+    rank_k_workspace_.Reference->copy_from_device(rank_k_workspace_.C->data());
+  }
+
+
+  //
+  // Initialize the CUTLASS operation
+  //
+  Status status = Status::kSuccess;
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    if (options.execution_mode != ExecutionMode::kDryRun) {
+
+      uint64_t workspace_size = operation->get_host_workspace_size(&rank_k_workspace_.configuration);
+      rank_k_workspace_.host_workspace.resize(workspace_size, 0);
+
+      workspace_size = operation->get_device_workspace_size(&rank_k_workspace_.configuration);
+      rank_k_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
+
+      status = operation->initialize(
+        &rank_k_workspace_.configuration,
+        rank_k_workspace_.host_workspace.data(),
+        rank_k_workspace_.device_workspace.data());
+    }
+
+    //
+    // If CUTLASS is enabled, generate a result for it
+    //
+    results_.push_back(model_result_);
+    results_.back().provider = library::Provider::kCUTLASS;
+    results_.back().op_kind = library::OperationKind::kRankK;
+    results_.back().disposition = Disposition::kNotRun;
+
+    for(auto provider : verification_providers_) {
+      results_.back().verification_map[provider] = Disposition::kNotRun;
+    }
+  }
+
+  return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool RankKOperationProfiler::verify_cutlass(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+    return true;
+  }
+
+  if (options.execution_mode == ExecutionMode::kDryRun) {
+    return true;
+  }
+
+  // Initialize structure containing RankK arguments
+  rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+  rank_k_workspace_.arguments.C = rank_k_workspace_.C->data();
+  rank_k_workspace_.arguments.D = rank_k_workspace_.Computed->data();
+  rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+  rank_k_workspace_.arguments.beta = problem_.beta.data();
+  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  //
+  // Run the CUTLASS operation
+  //
+
+  results_.back().status = operation->run(
+    &rank_k_workspace_.arguments, 
+    rank_k_workspace_.host_workspace.data(),
+    rank_k_workspace_.device_workspace.data());
+
+  if (results_.back().status != Status::kSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  cudaError_t result = cudaDeviceSynchronize();
+  if (result != cudaSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  // CUTLASS op ran the but not yet verified against any verification provider
+  results_.back().disposition = Disposition::kNotVerified;
+
+  //
+  // Run verification providers
+  //
+
+  if (options.verification.enabled) {
+
+#if CUTLASS_ENABLE_CUBLAS
+    if (options.verification.provider_enabled(library::Provider::kCUBLAS)) {
+
+      // Guard against unsupported cases
+      auto const & rank_k_desc = static_cast<library::RankKDescription const &>(operation->description());
+
+      if (cublas_satisfies(rank_k_desc) == Status::kSuccess) {
+
+        // call cublas verification if supported
+        verify_with_cublas_(
+          options,
+          report,
+          device_context,
+          operation,
+          problem_space,
+          problem);
+        }
+
+      else {
+        // set verification map for cublas to not supported
+        results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotSupported;
+      }
+    }
+#endif // #if CUTLASS_ENABLE_CUBLAS
+    
+    // Update disposition to worst case verification outcome among all 
+    // verification providers which are supported
+    bool is_any_verification_run_passed = false;
+    for(auto &m : results_.back().verification_map) {
+      if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) {
+        results_.back().disposition = m.second;
+        return true;
+      }
+      if(!is_any_verification_run_passed && m.second == Disposition::kPassed) {
+        is_any_verification_run_passed = true;
+      }
+    }
+
+    if(is_any_verification_run_passed) {
+      results_.back().disposition = Disposition::kPassed;
+    }
+  }
+
+  // Return true means continue profiling
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool RankKOperationProfiler::verify_with_cublas_(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+
+#if CUTLASS_ENABLE_CUBLAS
+
+  library::RankKDescription const &rank_k_desc = 
+    static_cast<library::RankKDescription const &>(operation->description());
+
+  //
+  // Construct cuBLAS operators
+  //
+    
+  CublasCreate handle;
+  cublasStatus_t status = handle.get_cublas_create_status();
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+    return true;
+  }
+
+  //
+  // Initialize state
+  //
+
+  try {
+
+    //
+    // Construct dispatcher to cublas<t>Syrk()
+    //
+
+    // Initialize structure containing RankK arguments
+    rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+    rank_k_workspace_.arguments.C = rank_k_workspace_.Reference->data();
+    rank_k_workspace_.arguments.D = rank_k_workspace_.Reference->data();
+    rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+    rank_k_workspace_.arguments.beta = problem_.beta.data();
+    rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    detail::cublasRankKDispatcher rank_k_op( 
+      rank_k_desc, 
+      rank_k_workspace_.configuration,
+      rank_k_workspace_.arguments
+    );
+
+    if (rank_k_op.status != Status::kSuccess) {
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
+      return true;
+    }
+
+    results_.back().status = Status::kSuccess;
+
+    status = rank_k_op(handle);
+
+    // Handle errors
+    if (status != CUBLAS_STATUS_SUCCESS) {
+
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+      return true;
+    }
+
+    //
+    // Verify results
+    //
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = compare_tensors(
+      options,
+      *rank_k_workspace_.Computed,
+      *rank_k_workspace_.Reference
+    );
+
+    // Save workspace if incorrect
+    if (options.verification.save_workspace == SaveWorkspace::kIncorrect && 
+      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
+
+      save_workspace(
+        device_context,
+        options,
+        rank_k_desc,
+        library::Provider::kCUTLASS,
+        library::Provider::kCUBLAS);
+    }
+  }
+  catch (...) {
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+  }
+
+#endif
+
+  // Return true means continue profiling
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Measures performance results
+bool RankKOperationProfiler::profile(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    // Initialize structure containing RankK arguments
+    rank_k_workspace_.arguments.A = rank_k_workspace_.A->data();
+    rank_k_workspace_.arguments.C = rank_k_workspace_.C->data();
+    rank_k_workspace_.arguments.D = rank_k_workspace_.Computed->data();
+    rank_k_workspace_.arguments.alpha = problem_.alpha.data();
+    rank_k_workspace_.arguments.beta = problem_.beta.data();
+    rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    results_.back().status = profile_cutlass_(
+      results_.back().runtime,
+      options,
+      operation,
+      &rank_k_workspace_.arguments,
+      rank_k_workspace_.host_workspace.data(),
+      rank_k_workspace_.device_workspace.data()
+    );
+  }
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/rank_k_operation_profiler.h b/tools/profiler/src/rank_k_operation_profiler.h
new file mode 100644
index 00000000..23a9c2ff
--- /dev/null
+++ b/tools/profiler/src/rank_k_operation_profiler.h
@@ -0,0 +1,227 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines a math function
+
+  
+*/
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <unordered_map>
+
+// CUTLASS Library includes
+#include "cutlass/blas3.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/util.h"
+#include "cutlass/library/manifest.h"
+
+// Profiler includes
+#include "options.h"
+#include "device_context.h"
+#include "operation_profiler.h"
+#include "performance_result.h"
+#include "problem_space.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/// Abstract base class for each math function
+class RankKOperationProfiler : public OperationProfiler {
+public:
+
+  /// Problem structure obtained from problem space
+  struct RankKProblem {
+    int64_t n;
+    int64_t k;
+    int64_t lda;
+    int64_t ldc;
+    FillMode fill_mode;
+    BlasMode blas_mode;
+    std::vector<uint8_t> alpha;
+    std::vector<uint8_t> beta;
+    int64_t split_k_slices;
+    int64_t batch_count;
+
+    //
+    // Methods
+    //
+
+    RankKProblem(): 
+      n(16), k(16), lda(0), ldc(0), 
+      fill_mode(FillMode::kInvalid), blas_mode(BlasMode::kInvalid), 
+      split_k_slices(1), batch_count(1) { }
+
+    /// Parses the problem
+    Status parse(
+      library::RankKDescription const &operation_desc,
+      ProblemSpace const &problem_space,
+      ProblemSpace::Problem const &problem);
+
+    /// Total number of bytes loaded
+    int64_t bytes(library::RankKDescription const &operation_desc) const;
+
+    /// Total number of flops computed
+    int64_t flops(library::RankKDescription const &operation_desc) const;
+
+    /// Initializes a performance result
+    void initialize_result(
+      PerformanceResult &result,
+      library::RankKDescription const &operation_desc,
+      ProblemSpace const &problem_space);
+  };
+
+  /// Workspace used 
+  struct RankKWorkspace {
+
+    DeviceAllocation *A;
+    DeviceAllocation *C;
+    DeviceAllocation *Computed;
+    DeviceAllocation *Reference;
+
+    library::RankKConfiguration configuration;
+    library::RankKArguments arguments;
+
+    /// Buffer used for the operation's host workspace
+    std::vector<uint8_t> host_workspace;
+
+    /// Buffer used for the operations' device workspace
+    DeviceAllocation device_workspace;
+
+    //
+    // Methods
+    //
+
+    RankKWorkspace(): 
+      A(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
+  };
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem obtained from problem space
+  RankKProblem problem_;
+
+  /// Device memory allocations 
+  RankKWorkspace rank_k_workspace_;
+
+
+public:
+  //
+  // Methods
+  //
+
+  /// Ctor
+  RankKOperationProfiler(Options const &options);
+
+  /// Destructor
+  virtual ~RankKOperationProfiler();
+
+  /// Prints usage statement for the math function
+  virtual void print_usage(std::ostream &out) const;
+
+  /// Prints examples
+  virtual void print_examples(std::ostream &out) const;
+
+  /// Extracts the problem dimensions
+  virtual Status initialize_configuration(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Initializes workspace
+  virtual Status initialize_workspace(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Verifies CUTLASS against references
+  virtual bool verify_cutlass(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Measures performance results
+  virtual bool profile(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+protected:
+
+  /// Initializes the performance result
+  void initialize_result_(
+    PerformanceResult &result,
+    Options const &options,  
+    library::RankKDescription const &operation_desc,
+    ProblemSpace const &problem_space);
+
+  /// Verifies CUTLASS against references
+  bool verify_with_cublas_(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/reduction_operation_profiler.h b/tools/profiler/src/reduction_operation_profiler.h
index 98358832..5c340944 100644
--- a/tools/profiler/src/reduction_operation_profiler.h
+++ b/tools/profiler/src/reduction_operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu
index 5171fef5..441a611f 100644
--- a/tools/profiler/src/sparse_gemm_operation_profiler.cu
+++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.h b/tools/profiler/src/sparse_gemm_operation_profiler.h
index db3026ff..1f14fec7 100644
--- a/tools/profiler/src/sparse_gemm_operation_profiler.h
+++ b/tools/profiler/src/sparse_gemm_operation_profiler.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/profiler/src/symm_operation_profiler.cu b/tools/profiler/src/symm_operation_profiler.cu
new file mode 100644
index 00000000..5617f9ae
--- /dev/null
+++ b/tools/profiler/src/symm_operation_profiler.cu
@@ -0,0 +1,764 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Execution environment
+
+  
+*/
+
+#include <iostream>
+#include <stdexcept>
+#include <iomanip>
+#include <ios>
+
+#include "cutlass/core_io.h"
+
+#include "cublas_helpers.h"
+#include "symm_operation_profiler.h"
+#include "gpu_timer.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Ctor
+SymmOperationProfiler::SymmOperationProfiler(Options const &options): 
+  OperationProfiler(
+    options,
+    library::OperationKind::kSymm,
+    {
+      {ArgumentTypeID::kEnumerated, {"symm_kind"}, "Variant of Symm (universal)"},
+      {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the Symm problem space"},
+      {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the Symm problem space"},
+      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
+      {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
+      {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
+      {ArgumentTypeID::kEnumerated, {"side_mode"}, "Side Mode for Symm kernel (left or right)"},
+      {ArgumentTypeID::kEnumerated, {"fill_mode"}, "Fill Mode for Symm kernel (lower or upper)"},
+      {ArgumentTypeID::kEnumerated, {"blas_mode"}, "Blas Mode for Symm kernel (symmetric or hermitian)"},
+      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
+      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
+      {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
+      {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of Symm computed in one batch"},
+    },
+    { library::Provider::kCUBLAS }
+  ) {
+  description_ = "      Symmetric Matrix-Matrix Multiplication. D = alpha * A * B OR alpha * B * A + beta * C (where A is symmetric/hermitian)";
+}
+
+/// Destructor
+SymmOperationProfiler::~SymmOperationProfiler() {
+
+}
+
+/// Prints usage statement for the math function
+void SymmOperationProfiler::print_usage(std::ostream &out) const {
+  out << "Symm" << "\n\n";
+
+  OperationProfiler::print_usage(out);
+}
+
+/// Prints examples
+void SymmOperationProfiler::print_examples(std::ostream &out) const {
+
+  out << "\nExamples:\n\n"
+    << "Profile a particular problem size SYMM kernel:\n"
+    << "  $ cutlass_profiler --operation=Symm --blas_mode=symmetric --m=1024 --n=128\n\n"
+    
+    << "Profile a particular problem size HEMM kernel:\n"
+    << "  $ cutlass_profiler --operation=Symm --blas_mode=hermitian --m=1024 --n=128\n\n"
+
+    << "Schmoo over problem size and beta:\n"
+    << "  $ cutlass_profiler --operation=Symm --m=1024:4096:256 --n=128:8192:128 --beta=0,1,2.5\n\n"
+
+    << "Schmoo over accumulator types:\n"
+    << "  $ cutlass_profiler --operation=Symm --accumulator-type=f16,f32\n\n"
+
+    << "Schmoo over side modees:\n"
+    << "  $ cutlass_profiler --operation=Symm --side_mode=left/right\n\n"
+
+    << "Schmoo over fill modees:\n"
+    << "  $ cutlass_profiler --operation=Symm --fill_mode=lower/upper\n\n"
+
+    << "Run when A is f16 with column-major or A is any datatype with row-major (For column major, use column, col, or n. For row major use, row or t):\n"
+    << "  $ cutlass_profiler --operation=Symm --A=f16:column or --A=*:row\n\n"
+
+    << "Using various input value distribution:\n"
+    << "  $ cutlass_profiler --operation=Symm --dist=uniform,min:0,max:3\n"
+    << "  $ cutlass_profiler --operation=Symm --dist=gaussian,mean:0,stddev:3\n"
+    << "  $ cutlass_profiler --operation=Symm --dist=sequential,start:0,delta:1\n\n"
+
+    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
+    << " $ cutlass_profiler --operation=Symm --cta_m=256 --cta_n=128  --cta_k=32 --save-workspace=incorrect\n\n"
+    
+    << "Test your changes to symm kernels with a quick functional test and save results in functional-test.csv:\n"
+    << " $ cutlass_profiler  --operation=Symm \\ \n"
+    << "   --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
+    << "   --n=8,16,32,64,128,256,288,384,504,512,520 \\ \n"
+    << "   --beta=0,1,2 --profiling-iterations=1 \\ \n"
+    << "   --providers=cutlass --output=functional-test.csv\n\n";
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+// used this for debugging
+static std::string byte_string(std::vector<uint8_t> const &bytes) {
+  std::stringstream ss;
+
+  ss << "0x";
+
+  for (size_t idx = bytes.size(); idx > 0; --idx) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1));
+  }
+
+  return ss.str();
+}
+#endif
+
+Status SymmOperationProfiler::SymmProblem::parse(
+  library::SymmDescription const &operation_desc,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  if (!arg_as_int(this->m, "m", problem_space, problem)) {
+    // default value
+    this->m = 1024;
+  }
+  
+  if (!arg_as_int(this->n, "n", problem_space, problem)) {
+    // default value
+    this->n = 1024;
+  }
+  
+  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
+    // default value
+    this->split_k_slices = 1;
+  }
+  
+  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
+    // default value
+    this->batch_count = 1;
+  }
+
+  if (this->split_k_slices > 1 && this->batch_count > 1) {
+    // At least one of these must be one
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.B, "B", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.C, "C", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!arg_as_scalar(
+    this->alpha, 
+    operation_desc.element_epilogue, 
+    "alpha", 
+    problem_space, 
+    problem)) {
+
+    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (!arg_as_scalar(
+    this->beta, 
+    operation_desc.element_epilogue, 
+    "beta", 
+    problem_space, 
+    problem)) {
+    
+    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (operation_desc.side_mode == SideMode::kLeft) {
+    this->lda = DeviceAllocation::get_packed_layout(
+      operation_desc.A.layout, {int(this->m), int(this->m)}).front();
+  }
+  else if (operation_desc.side_mode == SideMode::kRight) {
+    this->lda = DeviceAllocation::get_packed_layout(
+      operation_desc.A.layout, {int(this->n), int(this->n)}).front();
+  }
+
+  this->ldb = DeviceAllocation::get_packed_layout(
+    operation_desc.B.layout, {int(this->m), int(this->n)}).front();
+
+  this->ldc = DeviceAllocation::get_packed_layout(
+    operation_desc.C.layout, {int(this->m), int(this->n)}).front();
+
+  return Status::kSuccess;
+}
+
+/// Total number of bytes loaded
+int64_t SymmOperationProfiler::SymmProblem::bytes(library::SymmDescription const &operation_desc) const {
+  int64_t bytes;
+  // Input bytes read and Output bytes written for the gemm problem
+  // Half matrix including the diagonal will have (X*(X+1))/2 elements
+  if (operation_desc.side_mode == SideMode::kLeft) {
+    bytes =
+      int64_t(library::sizeof_bits(operation_desc.A.element) * m / 8) * (m + 1) / 2 +
+      int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n + 
+      int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
+  } else if (operation_desc.side_mode == SideMode::kRight) {
+    bytes =
+      int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * (n + 1) / 2 +
+      int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n + 
+      int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
+  }
+  // Set is_beta_zero true if beta is zero
+  bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; });
+
+  // Output bytes read for the gemm problem for non-zero beta values
+  if (!is_beta_zero) {
+    bytes += int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
+  }
+
+  bytes *= batch_count;
+
+  return bytes;
+}
+
+/// Total number of flops computed
+int64_t SymmOperationProfiler::SymmProblem::flops(library::SymmDescription const &operation_desc) const {
+
+  // FLOPs for first TRMM kernel (with diagonal) = 2 * [ ( M * (M+1)/2 * N ) ] // Beta is zero
+  // FLOPs for second TRMM kernel (with diagonal) = 2 * [ ( M * (M-1)/2 * N ) ] // Beta is zero
+  // FLOPs = m*(m+1)*n [mma1] + m*(m-1)*n [mma2] + 2*m*n [epilogue]
+  // FLOPs = 2*m*n(m+1) for left side mode
+  // FLOPs can also be calculated to be same as GEMM with correct value for 'k' as below.
+  int64_t k = (operation_desc.side_mode == SideMode::kLeft) ? int64_t(m) : int64_t(n);
+  int64_t flops_ = (int64_t(m) * n * k + m * n) * 2;
+
+  // complex-valued support
+  switch (operation_desc.tile_description.math_instruction.math_operation) {
+  case library::MathOperationID::kMultiplyAddComplex:
+    flops_ *= 4;
+    break;
+    
+  case library::MathOperationID::kMultiplyAddComplexFastF32:
+    flops_ *= 4;
+    break;
+
+  case library::MathOperationID::kMultiplyAddGaussianComplex:
+    flops_ *= 3;
+    break;
+
+  default: break;
+  }
+
+  return flops_;
+}
+
+/// Initializes a performance result
+void SymmOperationProfiler::SymmProblem::initialize_result(
+  PerformanceResult &result,
+  library::SymmDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.arguments.resize(problem_space.rank());
+
+  set_argument(result, "symm_kind", problem_space, library::to_string(operation_desc.symm_kind));
+
+  set_argument(result, "A", problem_space,
+    std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout));
+
+  set_argument(result, "B", problem_space,
+    std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout));
+
+  set_argument(result, "C", problem_space,
+    std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
+
+  set_argument(result, "side_mode", problem_space, library::to_string(operation_desc.side_mode));
+
+  set_argument(result, "fill_mode", problem_space, library::to_string(operation_desc.fill_mode));
+
+  set_argument(result, "blas_mode", problem_space, library::to_string(operation_desc.blas_mode));
+
+  set_argument(result, "m", problem_space, m);
+  set_argument(result, "n", problem_space, n);
+
+  set_argument(result, "split_k_slices", problem_space, split_k_slices);
+  set_argument(result, "batch_count", problem_space, batch_count);
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(beta, operation_desc.element_epilogue));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Extracts the problem dimensions
+Status SymmOperationProfiler::initialize_configuration(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::SymmDescription const &operation_desc = 
+    static_cast<library::SymmDescription const &>(operation->description());
+
+  if (operation_desc.symm_kind != library::SymmKind::kUniversal) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  Status status = problem_.parse(operation_desc, problem_space, problem);
+  
+  if (status != Status::kSuccess) {
+    return status;
+  }
+
+  symm_workspace_.configuration.problem_size.m() = int(problem_.m);
+  symm_workspace_.configuration.problem_size.n() = int(problem_.n);
+  symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft) 
+                                                    ? int(problem_.m) : int(problem_.n);
+  symm_workspace_.configuration.lda = problem_.lda;
+  symm_workspace_.configuration.ldb = problem_.ldb;
+  symm_workspace_.configuration.ldc = problem_.ldc;
+  symm_workspace_.configuration.ldd = problem_.ldc;
+  //symm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices);
+  symm_workspace_.configuration.batch_count = int(problem_.split_k_slices);
+
+  symm_workspace_.arguments.A = nullptr;
+  symm_workspace_.arguments.B = nullptr;
+  symm_workspace_.arguments.C = nullptr;
+  symm_workspace_.arguments.D = nullptr;
+  symm_workspace_.arguments.alpha = problem_.alpha.data();
+  symm_workspace_.arguments.beta = problem_.beta.data();
+  symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  initialize_result_(this->model_result_, options, operation_desc, problem_space);
+  
+  return operation->can_implement(&symm_workspace_.configuration, &symm_workspace_.arguments);
+}
+
+/// Initializes the performance result
+void SymmOperationProfiler::initialize_result_(
+  PerformanceResult &result,
+  Options const &options,  
+  library::SymmDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.provider = library::Provider::kCUTLASS;
+  result.disposition = Disposition::kNotRun;
+  result.status = Status::kSuccess;
+  result.operation_name = operation_desc.name;
+  
+  problem_.initialize_result(result, operation_desc, problem_space);
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+
+  result.bytes = problem_.bytes(operation_desc);
+  result.flops = problem_.flops(operation_desc);
+  result.runtime = 0;
+
+
+}
+
+/// Initializes workspace
+Status SymmOperationProfiler::initialize_workspace(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  library::SymmDescription const &operation_desc = 
+    static_cast<library::SymmDescription const &>(operation->description());
+
+  if (options.execution_mode != ExecutionMode::kDryRun) {
+
+    if (operation_desc.side_mode == SideMode::kLeft) {
+      symm_workspace_.A = device_context.allocate_tensor(
+        options,
+        "A",
+        operation_desc.A.element,
+        operation_desc.A.layout,
+        {int(problem_.m), int(problem_.m)},
+        {int(problem_.lda)},
+        1 // batch_count = 1, default
+      );
+    } else if (operation_desc.side_mode == SideMode::kRight) {
+      symm_workspace_.A = device_context.allocate_tensor(
+        options,
+        "A",
+        operation_desc.A.element,
+        operation_desc.A.layout,
+        {int(problem_.n), int(problem_.n)},
+        {int(problem_.lda)},
+        1 // batch_count = 1, default
+      );
+    }
+
+    symm_workspace_.B = device_context.allocate_tensor(
+      options,
+      "B",
+      operation_desc.B.element,
+      operation_desc.B.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldb)}
+    );
+
+    symm_workspace_.C = device_context.allocate_tensor(
+      options,
+      "C",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldc)},
+      1 // batch_count = 1, default
+    );
+
+    symm_workspace_.Computed = device_context.allocate_tensor(
+      "D",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    symm_workspace_.Reference = device_context.allocate_tensor(
+      "Reference",
+      operation_desc.C.element,
+      operation_desc.C.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldc)}
+    );
+
+    symm_workspace_.Computed->copy_from_device(symm_workspace_.C->data());
+    symm_workspace_.Reference->copy_from_device(symm_workspace_.C->data());
+  }
+
+
+  //
+  // Initialize the CUTLASS operation
+  //
+  Status status = Status::kSuccess;
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    if (options.execution_mode != ExecutionMode::kDryRun) {
+
+      uint64_t workspace_size = operation->get_host_workspace_size(&symm_workspace_.configuration);
+      symm_workspace_.host_workspace.resize(workspace_size, 0);
+
+      workspace_size = operation->get_device_workspace_size(&symm_workspace_.configuration);
+      symm_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
+
+      status = operation->initialize(
+        &symm_workspace_.configuration,
+        symm_workspace_.host_workspace.data(),
+        symm_workspace_.device_workspace.data());
+    }
+
+    //
+    // If CUTLASS is enabled, generate a result for it
+    //
+    results_.push_back(model_result_);
+    results_.back().provider = library::Provider::kCUTLASS;
+    results_.back().op_kind = library::OperationKind::kSymm;
+    results_.back().disposition = Disposition::kNotRun;
+
+    for(auto provider : verification_providers_) {
+      results_.back().verification_map[provider] = Disposition::kNotRun;
+    }
+  }
+
+  return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool SymmOperationProfiler::verify_cutlass(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+    return true;
+  }
+
+  if (options.execution_mode == ExecutionMode::kDryRun) {
+    return true;
+  }
+
+  // Initialize structure containing Symm arguments
+  symm_workspace_.arguments.A = symm_workspace_.A->data();
+  symm_workspace_.arguments.B = symm_workspace_.B->data();
+  symm_workspace_.arguments.C = symm_workspace_.C->data();
+  symm_workspace_.arguments.D = symm_workspace_.Computed->data();
+  symm_workspace_.arguments.alpha = problem_.alpha.data();
+  symm_workspace_.arguments.beta = problem_.beta.data();
+  symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  //
+  // Run the CUTLASS operation
+  //
+
+  results_.back().status = operation->run(
+    &symm_workspace_.arguments, 
+    symm_workspace_.host_workspace.data(),
+    symm_workspace_.device_workspace.data());
+
+  if (results_.back().status != Status::kSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  cudaError_t result = cudaDeviceSynchronize();
+  if (result != cudaSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  // CUTLASS op ran the but not yet verified against any verification provider
+  results_.back().disposition = Disposition::kNotVerified;
+
+  //
+  // Run verification providers
+  //
+
+  if (options.verification.enabled) {
+
+#if CUTLASS_ENABLE_CUBLAS
+    if (options.verification.provider_enabled(library::Provider::kCUBLAS)) {
+
+      // Guard against unsupported cases
+      auto const & symm_desc = static_cast<library::SymmDescription const &>(operation->description());
+
+      if (cublas_satisfies(symm_desc) == Status::kSuccess) {
+
+        // call cublas verification if supported
+        verify_with_cublas_(
+          options,
+          report,
+          device_context,
+          operation,
+          problem_space,
+          problem);
+        }
+
+      else {
+        // set verification map for cublas to not supported
+        results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotSupported;
+      }
+    }
+#endif // #if CUTLASS_ENABLE_CUBLAS
+    
+    // Update disposition to worst case verification outcome among all 
+    // verification providers which are supported
+    bool is_any_verification_run_passed = false;
+    for(auto &m : results_.back().verification_map) {
+      if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) {
+        results_.back().disposition = m.second;
+        return true;
+      }
+      if(!is_any_verification_run_passed && m.second == Disposition::kPassed) {
+        is_any_verification_run_passed = true;
+      }
+    }
+
+    if(is_any_verification_run_passed) {
+      results_.back().disposition = Disposition::kPassed;
+    }
+  }
+
+  // Return true means continue profiling
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool SymmOperationProfiler::verify_with_cublas_(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+
+#if CUTLASS_ENABLE_CUBLAS
+
+  library::SymmDescription const &symm_desc = 
+    static_cast<library::SymmDescription const &>(operation->description());
+
+  //
+  // Construct cuBLAS operators
+  //
+    
+  CublasCreate handle;
+  cublasStatus_t status = handle.get_cublas_create_status();
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+    return true;
+  }
+
+  //
+  // Initialize state
+  //
+
+  try {
+
+    //
+    // Construct dispatcher to cublas<t>Symm()
+    //
+
+    // Initialize structure containing Symm arguments
+    symm_workspace_.arguments.A = symm_workspace_.A->data();
+    symm_workspace_.arguments.B = symm_workspace_.B->data();
+    symm_workspace_.arguments.C = symm_workspace_.Reference->data();
+    symm_workspace_.arguments.D = symm_workspace_.Reference->data();
+    symm_workspace_.arguments.alpha = problem_.alpha.data();
+    symm_workspace_.arguments.beta = problem_.beta.data();
+    symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    detail::cublasSymmDispatcher symm_op( 
+      symm_desc, 
+      symm_workspace_.configuration,
+      symm_workspace_.arguments
+    );
+
+    if (symm_op.status != Status::kSuccess) {
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
+      return true;
+    }
+
+    results_.back().status = Status::kSuccess;
+
+    status = symm_op(handle);
+
+    // Handle errors
+    if (status != CUBLAS_STATUS_SUCCESS) {
+
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+      return true;
+    }
+
+    //
+    // Verify results
+    //
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = compare_tensors(
+      options,
+      *symm_workspace_.Computed,
+      *symm_workspace_.Reference
+    );
+
+    // Save workspace if incorrect
+    if (options.verification.save_workspace == SaveWorkspace::kIncorrect && 
+      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
+
+      save_workspace(
+        device_context,
+        options,
+        symm_desc,
+        library::Provider::kCUTLASS,
+        library::Provider::kCUBLAS);
+    }
+  }
+  catch (...) {
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+  }
+
+#endif
+
+  // Return true means continue profiling
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Measures performance results
+bool SymmOperationProfiler::profile(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    // Initialize structure containing Symm arguments
+    symm_workspace_.arguments.A = symm_workspace_.A->data();
+    symm_workspace_.arguments.B = symm_workspace_.B->data();
+    symm_workspace_.arguments.C = symm_workspace_.C->data();
+    symm_workspace_.arguments.D = symm_workspace_.Computed->data();
+    symm_workspace_.arguments.alpha = problem_.alpha.data();
+    symm_workspace_.arguments.beta = problem_.beta.data();
+    symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    results_.back().status = profile_cutlass_(
+      results_.back().runtime,
+      options,
+      operation,
+      &symm_workspace_.arguments,
+      symm_workspace_.host_workspace.data(),
+      symm_workspace_.device_workspace.data()
+    );
+  }
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/symm_operation_profiler.h b/tools/profiler/src/symm_operation_profiler.h
new file mode 100644
index 00000000..3a9efd72
--- /dev/null
+++ b/tools/profiler/src/symm_operation_profiler.h
@@ -0,0 +1,230 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines a math function
+
+  
+*/
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <unordered_map>
+
+// CUTLASS Library includes
+#include "cutlass/blas3.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/util.h"
+#include "cutlass/library/manifest.h"
+
+// Profiler includes
+#include "options.h"
+#include "device_context.h"
+#include "operation_profiler.h"
+#include "performance_result.h"
+#include "problem_space.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+/// Abstract base class for each math function
+class SymmOperationProfiler : public OperationProfiler {
+public:
+
+  /// Problem structure obtained from problem space
+  struct SymmProblem {
+    int64_t m;
+    int64_t n;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldc;
+    SideMode side_mode;
+    FillMode fill_mode;
+    BlasMode blas_mode;
+    std::vector<uint8_t> alpha;
+    std::vector<uint8_t> beta;
+    int64_t split_k_slices;
+    int64_t batch_count;
+
+    //
+    // Methods
+    //
+
+    SymmProblem(): 
+      m(16), n(16), lda(0), ldb(0), ldc(0), 
+      side_mode(SideMode::kInvalid), fill_mode(FillMode::kInvalid), blas_mode(BlasMode::kInvalid), 
+      split_k_slices(1), batch_count(1) { }
+
+    /// Parses the problem
+    Status parse(
+      library::SymmDescription const &operation_desc,
+      ProblemSpace const &problem_space,
+      ProblemSpace::Problem const &problem);
+
+    /// Total number of bytes loaded
+    int64_t bytes(library::SymmDescription const &operation_desc) const;
+
+    /// Total number of flops computed
+    int64_t flops(library::SymmDescription const &operation_desc) const;
+    
+    /// Initializes a performance result
+    void initialize_result(
+      PerformanceResult &result,
+      library::SymmDescription const &operation_desc,
+      ProblemSpace const &problem_space);
+  };
+
+  /// Workspace used 
+  struct SymmWorkspace {
+
+    DeviceAllocation *A;
+    DeviceAllocation *B;
+    DeviceAllocation *C;
+    DeviceAllocation *Computed;
+    DeviceAllocation *Reference;
+
+    library::SymmConfiguration configuration;
+    library::SymmArguments arguments;
+
+    /// Buffer used for the operation's host workspace
+    std::vector<uint8_t> host_workspace;
+
+    /// Buffer used for the operations' device workspace
+    DeviceAllocation device_workspace;
+
+    //
+    // Methods
+    //
+
+    SymmWorkspace(): 
+      A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
+  };
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem obtained from problem space
+  SymmProblem problem_;
+
+  /// Device memory allocations 
+  SymmWorkspace symm_workspace_;
+
+
+public:
+  //
+  // Methods
+  //
+
+  /// Ctor
+  SymmOperationProfiler(Options const &options);
+
+  /// Destructor
+  virtual ~SymmOperationProfiler();
+
+  /// Prints usage statement for the math function
+  virtual void print_usage(std::ostream &out) const;
+
+  /// Prints examples
+  virtual void print_examples(std::ostream &out) const;
+
+  /// Extracts the problem dimensions
+  virtual Status initialize_configuration(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Initializes workspace
+  virtual Status initialize_workspace(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Verifies CUTLASS against references
+  virtual bool verify_cutlass(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Measures performance results
+  virtual bool profile(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+protected:
+
+  /// Initializes the performance result
+  void initialize_result_(
+    PerformanceResult &result,
+    Options const &options,  
+    library::SymmDescription const &operation_desc,
+    ProblemSpace const &problem_space);
+
+  /// Verifies CUTLASS against references
+  bool verify_with_cublas_(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/trmm_operation_profiler.cu b/tools/profiler/src/trmm_operation_profiler.cu
new file mode 100644
index 00000000..45f20f32
--- /dev/null
+++ b/tools/profiler/src/trmm_operation_profiler.cu
@@ -0,0 +1,704 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Execution environment
+
+  
+*/
+
+#include <iostream>
+#include <stdexcept>
+#include <iomanip>
+#include <ios>
+
+#include "cutlass/core_io.h"
+
+#include "cublas_helpers.h"
+#include "trmm_operation_profiler.h"
+#include "gpu_timer.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Ctor
+TrmmOperationProfiler::TrmmOperationProfiler(Options const &options): 
+  OperationProfiler(
+    options,
+    library::OperationKind::kTrmm,
+    {
+      {ArgumentTypeID::kEnumerated, {"trmm_kind"}, "Variant of TRMM (universal)"},
+      {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the TRMM problem space"},
+      {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the TRMM problem space"},
+      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
+      {ArgumentTypeID::kEnumerated, {"side_mode"}, "Side Mode for TRMM (left, right)"},
+      {ArgumentTypeID::kEnumerated, {"fill_mode"}, "Fill Mode for TRMM (lower, upper)"},
+      {ArgumentTypeID::kEnumerated, {"diag_type"}, "Diag Type for TRMM (nonunit, unit)"},
+      {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
+      {ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D operand"},
+      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
+      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
+      {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
+      {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of TRMMs computed in one batch"},
+    },
+    { library::Provider::kCUBLAS}
+  ) {
+  description_ = "      Triangular Matrix-Multiplication. D = alpha * A * B or alpha * B * A";
+}
+
+/// Destructor
+TrmmOperationProfiler::~TrmmOperationProfiler() {
+
+}
+
+/// Prints usage statement for the math function
+void TrmmOperationProfiler::print_usage(std::ostream &out) const {
+  out << "TRMM" << "\n\n";
+
+  OperationProfiler::print_usage(out);
+}
+
+/// Prints examples
+void TrmmOperationProfiler::print_examples(std::ostream &out) const {
+
+  out << "\nExamples:\n\n"
+    << "Profile a particular problem size:\n"
+    << "  $ cutlass_profiler --operation=Trmm --n=1024 --m=128\n\n"
+
+    << "Schmoo over problem size and beta:\n"
+    << "  $ cutlass_profiler --operation=Trmm --n=1024:4096:256 --m=128:8192:128 --beta=0,1,2.5\n\n"
+
+    << "Schmoo over accumulator types:\n"
+    << "  $ cutlass_profiler --operation=Trmm --accumulator-type=f16,f32\n\n"
+
+    << "Run when A is f16 with column-major or A is any datatype with row-major (For column major, use column, col, or n. For row major use, row or t):\n"
+    << "  $ cutlass_profiler --operation=Trmm --A=f16:column or --A=*:row\n\n"
+
+    << "Using various input value distribution:\n"
+    << "  $ cutlass_profiler --operation=Trmm --dist=uniform,min:0,max:3\n"
+    << "  $ cutlass_profiler --operation=Trmm --dist=gaussian,mean:0,stddev:3\n"
+    << "  $ cutlass_profiler --operation=Trmm --dist=sequential,start:0,delta:1\n\n"
+
+    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
+    << " $ cutlass_profiler --operation=Trmm --cta_m=256 --cta_n=128  --cta_k=32 --save-workspace=incorrect\n\n"
+    
+    << "Test your changes to trmm kernels with a quick functional test and save results in functional-test.csv:\n"
+    << " $ cutlass_profiler  --operation=Trmm \\ \n"
+    << "   --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
+    << "   --k=8,16,32,64,128,256,288,384,504,512,520 \\ \n"
+    << "   --beta=0,1,2 --profiling-iterations=1 \\ \n"
+    << "   --providers=cutlass --output=functional-test.csv\n\n";
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if 0
+// used this for debugging
+static std::string byte_string(std::vector<uint8_t> const &bytes) {
+  std::stringstream ss;
+
+  ss << "0x";
+
+  for (size_t idx = bytes.size(); idx > 0; --idx) {
+    ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1));
+  }
+
+  return ss.str();
+}
+#endif
+
+Status TrmmOperationProfiler::TrmmProblem::parse(
+  library::TrmmDescription const &operation_desc,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  if (!arg_as_int(this->m, "m", problem_space, problem)) {
+    // default value
+    this->m = 1024;
+  }
+  
+  if (!arg_as_int(this->n, "n", problem_space, problem)) {
+    // default value
+    this->n = 1024;
+  }
+  
+  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
+    // default value
+    this->split_k_slices = 1;
+  }
+  
+  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
+    // default value
+    this->batch_count = 1;
+  }
+
+  if (this->split_k_slices > 1 && this->batch_count > 1) {
+    // At least one of these must be one
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.B, "B", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  if (!arg_as_scalar(
+    this->alpha, 
+    operation_desc.element_epilogue, 
+    "alpha", 
+    problem_space, 
+    problem)) {
+
+    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (!arg_as_scalar(
+    this->beta, 
+    operation_desc.element_epilogue, 
+    "beta", 
+    problem_space, 
+    problem)) {
+    
+    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
+      return Status::kErrorInternal;
+    }
+  }
+  
+  if (operation_desc.side_mode == SideMode::kLeft) {
+    this->lda = DeviceAllocation::get_packed_layout(
+      operation_desc.A.layout, {int(this->m), int(this->m)}).front();
+  }
+  else if (operation_desc.side_mode == SideMode::kRight) {
+    this->lda = DeviceAllocation::get_packed_layout(
+      operation_desc.A.layout, {int(this->n), int(this->n)}).front();
+  }
+
+  this->ldb = DeviceAllocation::get_packed_layout(
+    operation_desc.B.layout, {int(this->m), int(this->n)}).front();
+
+  this->ldd = DeviceAllocation::get_packed_layout(
+    operation_desc.D.layout, {int(this->m), int(this->n)}).front();
+
+  return Status::kSuccess;
+}
+
+/// Initializes a performance result
+void TrmmOperationProfiler::TrmmProblem::initialize_result(
+  PerformanceResult &result,
+  library::TrmmDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.arguments.resize(problem_space.rank());
+
+  set_argument(result, "trmm_kind", problem_space, library::to_string(operation_desc.trmm_kind));
+
+  set_argument(result, "A", problem_space,
+    std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout));
+
+  set_argument(result, "side_mode", problem_space, library::to_string(operation_desc.side_mode));
+
+  set_argument(result, "fill_mode", problem_space, library::to_string(operation_desc.fill_mode));
+
+  set_argument(result, "diag_type", problem_space, library::to_string(operation_desc.diag_type));
+
+  set_argument(result, "B", problem_space,
+    std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout));
+
+  set_argument(result, "D", problem_space,
+    std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
+
+  set_argument(result, "m", problem_space, m);
+  set_argument(result, "n", problem_space, n);
+
+  set_argument(result, "split_k_slices", problem_space, split_k_slices);
+  set_argument(result, "batch_count", problem_space, batch_count);
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(beta, operation_desc.element_epilogue));
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Extracts the problem dimensions
+Status TrmmOperationProfiler::initialize_configuration(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::TrmmDescription const &operation_desc = 
+    static_cast<library::TrmmDescription const &>(operation->description());
+
+  if (operation_desc.trmm_kind != library::TrmmKind::kUniversal) {
+    return Status::kErrorInvalidProblem;
+  }
+
+  Status status = problem_.parse(operation_desc, problem_space, problem);
+  
+  if (status != Status::kSuccess) {
+    return status;
+  }
+
+  trmm_workspace_.configuration.problem_size.m() = int(problem_.m);
+  trmm_workspace_.configuration.problem_size.n() = int(problem_.n);
+  trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft) 
+                                                    ? int(problem_.m) : int(problem_.n);
+  trmm_workspace_.configuration.lda = problem_.lda;
+  trmm_workspace_.configuration.ldb = problem_.ldb;
+  trmm_workspace_.configuration.ldd = problem_.ldd;
+  //trmm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices);
+  trmm_workspace_.configuration.batch_count = int(problem_.split_k_slices);
+
+  trmm_workspace_.arguments.A = nullptr;
+  trmm_workspace_.arguments.B = nullptr;
+  trmm_workspace_.arguments.D = nullptr;
+  trmm_workspace_.arguments.alpha = problem_.alpha.data();
+  trmm_workspace_.arguments.beta = problem_.beta.data();
+  trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  initialize_result_(this->model_result_, options, operation_desc, problem_space);
+  
+  return operation->can_implement(&trmm_workspace_.configuration, &trmm_workspace_.arguments);
+}
+
+/// Initializes the performance result
+void TrmmOperationProfiler::initialize_result_(
+  PerformanceResult &result,
+  Options const &options,  
+  library::TrmmDescription const &operation_desc,
+  ProblemSpace const &problem_space) {
+
+  result.provider = library::Provider::kCUTLASS;
+  result.disposition = Disposition::kNotRun;
+  result.status = Status::kSuccess;
+  result.operation_name = operation_desc.name;
+  
+  problem_.initialize_result(result, operation_desc, problem_space);
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+  if (operation_desc.side_mode == SideMode::kLeft) {
+    // Input bytes read and Output bytes written for the trmm problem
+    result.bytes = 
+      // Half matrix including the diagonal will have (M*(M+1))/2 elements
+      int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * (problem_.m + 1) / 2 +
+      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n + 
+      int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
+  } else if (operation_desc.side_mode == SideMode::kRight) {
+    // Input bytes read and Output bytes written for the trmm problem
+    result.bytes = 
+      // Half matrix including the diagonal will have (N*(N+1))/2 elements
+      int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.n / 8) * (problem_.n + 1) / 2 +
+      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n + 
+      int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
+  }
+
+  // FLOPs = 2 * [ ( M * (M+1)/2 * N ) ] // Beta is zero
+  result.flops = problem_.m * (problem_.m + 1) * problem_.n;
+ 
+   result.runtime = 0;
+
+  // complex-valued support
+  switch (operation_desc.tile_description.math_instruction.math_operation) {
+  case library::MathOperationID::kMultiplyAddComplex:
+    result.flops *= 4;
+    break;
+    
+  case library::MathOperationID::kMultiplyAddComplexFastF32:
+    result.flops *= 4;
+    break;
+ 
+  default: break;
+  }
+
+}
+
+/// Initializes workspace
+Status TrmmOperationProfiler::initialize_workspace(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+  
+  library::TrmmDescription const &operation_desc = 
+    static_cast<library::TrmmDescription const &>(operation->description());
+
+  if (options.execution_mode != ExecutionMode::kDryRun) {
+
+    if (operation_desc.side_mode == SideMode::kLeft) {
+      trmm_workspace_.A = device_context.allocate_tensor(
+        options,
+        "A",
+        operation_desc.A.element,
+        operation_desc.A.layout,
+        {int(problem_.m), int(problem_.m)},
+        {int(problem_.lda)},
+        1 // batch_count = 1, default
+      );
+    } else if (operation_desc.side_mode == SideMode::kRight) {
+      trmm_workspace_.A = device_context.allocate_tensor(
+        options,
+        "A",
+        operation_desc.A.element,
+        operation_desc.A.layout,
+        {int(problem_.n), int(problem_.n)},
+        {int(problem_.lda)},
+        1 // batch_count = 1, default
+      );
+    }
+
+    trmm_workspace_.B = device_context.allocate_tensor(
+      options,
+      "B",
+      operation_desc.B.element,
+      operation_desc.B.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldb)}
+    );
+
+    trmm_workspace_.Computed = device_context.allocate_tensor(
+      "D",
+      operation_desc.D.element,
+      operation_desc.D.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldd)}
+    );
+
+    trmm_workspace_.Reference = device_context.allocate_tensor(
+      "Reference",
+      operation_desc.D.element,
+      operation_desc.D.layout,
+      {int(problem_.m), int(problem_.n)},
+      {int(problem_.ldd)}
+    );
+
+  }
+
+  //
+  // Initialize the CUTLASS operation
+  //
+  Status status = Status::kSuccess;
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    if (options.execution_mode != ExecutionMode::kDryRun) {
+
+      uint64_t workspace_size = operation->get_host_workspace_size(&trmm_workspace_.configuration);
+      trmm_workspace_.host_workspace.resize(workspace_size, 0);
+
+      workspace_size = operation->get_device_workspace_size(&trmm_workspace_.configuration);
+      trmm_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
+
+      status = operation->initialize(
+        &trmm_workspace_.configuration,
+        trmm_workspace_.host_workspace.data(),
+        trmm_workspace_.device_workspace.data());
+    }
+
+    //
+    // If CUTLASS is enabled, generate a result for it
+    //
+    results_.push_back(model_result_);
+    results_.back().provider = library::Provider::kCUTLASS;
+    results_.back().op_kind = library::OperationKind::kTrmm;
+    results_.back().disposition = Disposition::kNotRun;
+
+    for(auto provider : verification_providers_) {
+      results_.back().verification_map[provider] = Disposition::kNotRun;
+    }
+  }
+
+  return status;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool TrmmOperationProfiler::verify_cutlass(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+    return true;
+  }
+
+  if (options.execution_mode == ExecutionMode::kDryRun) {
+    return true;
+  }
+
+  // Initialize structure containing TRMM arguments
+  trmm_workspace_.arguments.A = trmm_workspace_.A->data();
+  trmm_workspace_.arguments.B = trmm_workspace_.B->data();
+  trmm_workspace_.arguments.D = trmm_workspace_.Computed->data();
+  trmm_workspace_.arguments.alpha = problem_.alpha.data();
+  trmm_workspace_.arguments.beta = problem_.beta.data();
+  trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+  //
+  // Run the CUTLASS operation
+  //
+
+  results_.back().status = operation->run(
+    &trmm_workspace_.arguments, 
+    trmm_workspace_.host_workspace.data(),
+    trmm_workspace_.device_workspace.data());
+
+  if (results_.back().status != Status::kSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  cudaError_t result = cudaDeviceSynchronize();
+  if (result != cudaSuccess) {
+    results_.back().disposition = Disposition::kFailed;
+    return false;
+  }
+
+  // CUTLASS op ran the but not yet verified against any verification provider
+  results_.back().disposition = Disposition::kNotVerified;
+
+  //
+  // Run verification providers
+  //
+
+  if (options.verification.enabled) {
+
+#if CUTLASS_ENABLE_CUBLAS
+    if (options.verification.provider_enabled(library::Provider::kCUBLAS)) {
+
+      // Guard against unsupported cases
+      auto const & trmm_desc = static_cast<library::TrmmDescription const &>(operation->description());
+
+      if (cublas_satisfies(trmm_desc) == Status::kSuccess) {
+
+        // call cublas verification if supported
+        verify_with_cublas_(
+          options,
+          report,
+          device_context,
+          operation,
+          problem_space,
+          problem);
+        }
+
+      else {
+        // set verification map for cublas to not supported
+        results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotSupported;
+      }
+    }
+#endif // #if CUTLASS_ENABLE_CUBLAS
+    
+    // Update disposition to worst case verification outcome among all 
+    // verification providers which are supported
+    bool is_any_verification_run_passed = false;
+    for(auto &m : results_.back().verification_map) {
+      if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) {
+        results_.back().disposition = m.second;
+        return true;
+      }
+      if(!is_any_verification_run_passed && m.second == Disposition::kPassed) {
+        is_any_verification_run_passed = true;
+      }
+    }
+
+    if(is_any_verification_run_passed) {
+      results_.back().disposition = Disposition::kPassed;
+    }
+  }
+
+  // Return true means continue profiling
+  return true;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Verifies CUTLASS against references
+bool TrmmOperationProfiler::verify_with_cublas_(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+
+#if CUTLASS_ENABLE_CUBLAS
+
+  library::TrmmDescription const &trmm_desc = 
+    static_cast<library::TrmmDescription const &>(operation->description());
+
+  //
+  // Construct cuBLAS operators
+  //
+    
+  CublasCreate handle;
+  cublasStatus_t status = handle.get_cublas_create_status();
+
+  if (status != CUBLAS_STATUS_SUCCESS) {
+
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+    return true;
+  }
+
+  //
+  // Initialize state
+  //
+
+  try {
+
+    //
+    // Construct dispatcher to cublas<t>Trmm()
+    //
+
+    // Initialize structure containing TRMM arguments
+    trmm_workspace_.arguments.A = trmm_workspace_.A->data();
+    trmm_workspace_.arguments.B = trmm_workspace_.B->data();
+    trmm_workspace_.arguments.D = trmm_workspace_.Reference->data();
+    trmm_workspace_.arguments.alpha = problem_.alpha.data();
+    trmm_workspace_.arguments.beta = problem_.beta.data();
+    trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    detail::cublasTrmmDispatcher trmm_op( 
+      trmm_desc, 
+      trmm_workspace_.configuration,
+      trmm_workspace_.arguments
+    );
+
+    if (trmm_op.status != Status::kSuccess) {
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
+      return true;
+    }
+
+    results_.back().status = Status::kSuccess;
+
+    status = trmm_op(handle);
+
+    // Handle errors
+    if (status != CUBLAS_STATUS_SUCCESS) {
+
+      results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+      return true;
+    }
+
+    //
+    // Verify results
+    //
+    results_.back().verification_map[library::Provider::kCUBLAS] = compare_tensors(
+      options,
+      *trmm_workspace_.Computed,
+      *trmm_workspace_.Reference
+    );
+
+    // Save workspace if incorrect
+    if (options.verification.save_workspace == SaveWorkspace::kIncorrect && 
+      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
+
+      save_workspace(
+        device_context,
+        options,
+        trmm_desc,
+        library::Provider::kCUTLASS,
+        library::Provider::kCUBLAS);
+    }
+  }
+  catch (...) {
+    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed;
+  }
+
+#endif
+
+  // Return true means continue profiling
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Measures performance results
+bool TrmmOperationProfiler::profile(
+  Options const &options,  
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
+
+    // Initialize structure containing TRMM arguments
+    trmm_workspace_.arguments.A = trmm_workspace_.A->data();
+    trmm_workspace_.arguments.B = trmm_workspace_.B->data();
+    trmm_workspace_.arguments.D = trmm_workspace_.Computed->data();
+    trmm_workspace_.arguments.alpha = problem_.alpha.data();
+    trmm_workspace_.arguments.beta = problem_.beta.data();
+    trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
+
+    results_.back().status = profile_cutlass_(
+      results_.back().runtime,
+      options,
+      operation,
+      &trmm_workspace_.arguments,
+      trmm_workspace_.host_workspace.data(),
+      trmm_workspace_.device_workspace.data()
+    );
+  }
+  return true;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/trmm_operation_profiler.h b/tools/profiler/src/trmm_operation_profiler.h
new file mode 100644
index 00000000..d1abd53b
--- /dev/null
+++ b/tools/profiler/src/trmm_operation_profiler.h
@@ -0,0 +1,222 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/* \file
+   \brief Defines a math function
+
+  
+*/
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <unordered_map>
+
+// CUTLASS Library includes
+#include "cutlass/blas3.h"
+#include "cutlass/library/library.h"
+#include "cutlass/library/util.h"
+#include "cutlass/library/manifest.h"
+
+// Profiler includes
+#include "options.h"
+#include "device_context.h"
+#include "operation_profiler.h"
+#include "performance_result.h"
+#include "problem_space.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace profiler {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Abstract base class for each math function
+class TrmmOperationProfiler : public OperationProfiler {
+public:
+
+  /// Problem structure obtained from problem space
+  struct TrmmProblem {
+    int64_t m;
+    int64_t n;
+    int64_t lda;
+    int64_t ldb;
+    int64_t ldd;
+    SideMode side_mode;
+    FillMode fill_mode;
+    DiagType diag_type;
+    std::vector<uint8_t> alpha;
+    std::vector<uint8_t> beta;
+    int64_t split_k_slices;
+    int64_t batch_count;
+
+    //
+    // Methods
+    //
+
+    TrmmProblem(): 
+      m(16), n(16), lda(0), ldb(0),  ldd(0), split_k_slices(1), batch_count(1) { }
+
+    /// Parses the problem
+    Status parse(
+      library::TrmmDescription const &operation_desc,
+      ProblemSpace const &problem_space,
+      ProblemSpace::Problem const &problem);
+
+    /// Initializes a performance result
+    void initialize_result(
+      PerformanceResult &result,
+      library::TrmmDescription const &operation_desc,
+      ProblemSpace const &problem_space);
+  };
+
+  /// Workspace used 
+  struct TrmmWorkspace {
+
+    DeviceAllocation *A;
+    DeviceAllocation *B;
+    DeviceAllocation *D;
+    DeviceAllocation *Computed;
+    DeviceAllocation *Reference;
+
+    library::TrmmConfiguration configuration;
+    library::TrmmArguments arguments;
+
+    /// Buffer used for the operation's host workspace
+    std::vector<uint8_t> host_workspace;
+
+    /// Buffer used for the operations' device workspace
+    DeviceAllocation device_workspace;
+
+    //
+    // Methods
+    //
+
+    TrmmWorkspace(): 
+      A(nullptr), B(nullptr), D(nullptr), Computed(nullptr), Reference(nullptr) { }
+  };
+
+protected:
+
+  //
+  // Data members
+  //
+
+  /// GEMM problem obtained from problem space
+  TrmmProblem problem_;
+
+  /// Device memory allocations 
+  TrmmWorkspace trmm_workspace_;
+
+
+public:
+  //
+  // Methods
+  //
+
+  /// Ctor
+  TrmmOperationProfiler(Options const &options);
+
+  /// Destructor
+  virtual ~TrmmOperationProfiler();
+
+  /// Prints usage statement for the math function
+  virtual void print_usage(std::ostream &out) const;
+
+  /// Prints examples
+  virtual void print_examples(std::ostream &out) const;
+
+  /// Extracts the problem dimensions
+  virtual Status initialize_configuration(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Initializes workspace
+  virtual Status initialize_workspace(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Verifies CUTLASS against references
+  virtual bool verify_cutlass(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+  /// Measures performance results
+  virtual bool profile(
+    Options const &options, 
+    PerformanceReport &report, 
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+protected:
+
+  /// Initializes the performance result
+  void initialize_result_(
+    PerformanceResult &result,
+    Options const &options,  
+    library::TrmmDescription const &operation_desc,
+    ProblemSpace const &problem_space);
+
+  /// Verifies CUTLASS against references
+  bool verify_with_cublas_(
+    Options const &options,  
+    PerformanceReport &report,
+    DeviceContext &device_context,
+    library::Operation const *operation,
+    ProblemSpace const &problem_space,
+    ProblemSpace::Problem const &problem);
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt
index cac97226..2d2333e1 100644
--- a/tools/util/CMakeLists.txt
+++ b/tools/util/CMakeLists.txt
@@ -1,23 +1,29 @@
-# Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Redistribution and use in source and binary forms, with or without modification, are permitted
-# provided that the following conditions are met:
-#     * Redistributions of source code must retain the above copyright notice, this list of
-#       conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright notice, this list of
-#       conditions and the following disclaimer in the documentation and/or other materials
-#       provided with the distribution.
-#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-#       to endorse or promote products derived from this software without specific prior written
-#       permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
 #
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 add_library(cutlass_tools_util_includes INTERFACE)
diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h
index 83270f8a..8d440d4e 100644
--- a/tools/util/include/cutlass/util/command_line.h
+++ b/tools/util/include/cutlass/util/command_line.h
@@ -1,19 +1,31 @@
 /******************************************************************************
- * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- * modification, are not permitted.
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
 
diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h
index e0581fcb..3eef5289 100644
--- a/tools/util/include/cutlass/util/debug.h
+++ b/tools/util/include/cutlass/util/debug.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h
index cce1dc57..6fc8e741 100644
--- a/tools/util/include/cutlass/util/device_dump.h
+++ b/tools/util/include/cutlass/util/device_dump.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h
index 202eb7f4..f821f979 100644
--- a/tools/util/include/cutlass/util/device_memory.h
+++ b/tools/util/include/cutlass/util/device_memory.h
@@ -1,19 +1,31 @@
 /******************************************************************************
- * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- * modification, are not permitted.
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
 
diff --git a/tools/util/include/cutlass/util/device_nchw_to_nhwc.h b/tools/util/include/cutlass/util/device_nchw_to_nhwc.h
new file mode 100644
index 00000000..355fb753
--- /dev/null
+++ b/tools/util/include/cutlass/util/device_nchw_to_nhwc.h
@@ -0,0 +1,141 @@
+/******************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * \brief cuda kernels to transform a device memory tensor from NCHW layout to NHWC layout.
+ */
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/tensor_ref.h"
+
+namespace cutlass {
+
+/** \brief interface to transform a device memory tensor from NCHW layout to NHWC layout.
+ * \tparam T: data type
+ */
+template <typename T>
+void nchw_to_nhwc(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNCHW> ref_input,
+                  TensorRef<T, layout::TensorNHWC> ref_output,
+                  cudaStream_t stream);
+
+template <typename T>
+__global__ void nchw_to_nhwc_kernel(T *output, 
+                                    const T *input, 
+                                    const int n,
+                                    const int h, 
+                                    const int w, 
+                                    const int c) {
+  const int hw = h*w;
+  const int chw = c*hw;
+  __shared__ T shbuf[32 * (32 + 1)]; 
+  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
+  const int32_t wid  = tid / 32;     
+  const int32_t lid  = tid % 32;     
+  const int32_t ni   = blockIdx.z;
+  const int32_t ci0  = blockIdx.y * 32;
+  const int32_t hwi0 = blockIdx.x * 32;
+
+  const size_t input_idx = ni * chw + (ci0 + wid) * hw + hwi0;
+  const T *A = input + input_idx;
+  if (hwi0 + lid < hw) {
+    const int lid_x_33 = lid * 33;
+    if ((ci0 + 32) <= c) {
+      int ci = wid;  // between 0 and 7
+      CUTLASS_PRAGMA_UNROLL
+      for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) {
+        shbuf[lid_x_33 + ci] = A[lid];
+        A                    = &A[8 * hw];
+        ci += 8;
+      }
+    } else {
+      for (int ci = wid; ci < 32; ci += 8) {
+        if ((ci + ci0) < c) {
+          shbuf[lid_x_33 + ci] = A[lid];
+        }
+        A = &A[8 * hw];
+      }
+    }
+  }
+  __syncthreads();
+
+  const int32_t ciOut = ci0 + lid;
+  output = &output[ni * chw + ciOut];
+  if (ciOut < c) {
+    if (hwi0 + 32 < hw) {
+      int hwI = wid;
+      CUTLASS_PRAGMA_UNROLL
+      for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) {
+        output[(hwi0 + hwI) * c] = shbuf[(hwI)*33 + lid];
+        hwI += 8;
+      }
+    } else {
+      for (int hwI = wid; hwI < 32; hwI += 8) {
+        if (hwi0 + hwI < hw) {
+          output[(hwi0 + hwI) * c] = shbuf[(hwI)*33 + lid];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void nchw_to_nhwc(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNCHW> ref_input,
+                  TensorRef<T, layout::TensorNHWC> ref_output,
+                  cudaStream_t stream) {
+  
+  assert(
+    input_tensor_size.n() == output_tensor_size.n() &&
+    input_tensor_size.c() == output_tensor_size.h() &&
+    input_tensor_size.h() == output_tensor_size.w() &&
+    input_tensor_size.w() == output_tensor_size.c());
+
+  int n = output_tensor_size.n();
+  int h = output_tensor_size.h();
+  int w = output_tensor_size.w();
+  int c = output_tensor_size.c();
+  
+  dim3 grid((h*w + 31)/32, (c + 31)/32, n);
+  dim3 block(32, 8);
+  nchw_to_nhwc_kernel<<<grid, block, 0, stream>>>(ref_output.data(), ref_input.data(), 
+                                                  n, h, w, c);
+}
+
+} //namespace cutlass
diff --git a/tools/util/include/cutlass/util/device_nhwc_padding.h b/tools/util/include/cutlass/util/device_nhwc_padding.h
new file mode 100644
index 00000000..5a9ae7ef
--- /dev/null
+++ b/tools/util/include/cutlass/util/device_nhwc_padding.h
@@ -0,0 +1,276 @@
+/******************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * \brief cuda kernels for padding in device memory with NHWC layout.
+ */
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/tensor_ref.h"
+
+namespace cutlass {
+
+/** \brief interface for padding in a device memory tensor with NHWC layout
+ * \tparam T: data type
+ */
+template <typename T>
+void nhwc_padding(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNHWC> ref_input,
+                  TensorRef<T, layout::TensorNHWC> ref_output,
+                  cudaStream_t stream);
+
+
+template <typename T>
+__global__ void nhwc_padding_kernel(const int32_t n,
+                                    const int32_t h,
+                                    const int32_t w,
+                                    const int32_t c_in,
+                                    const int32_t c_out,
+                                    const T zero,
+                                    const T *input,
+                                    T *output){
+
+  const int32_t idx_jump       = blockDim.x * gridDim.x;
+  const int32_t total_elements = n * h * w * c_out;
+
+  int32_t c_idx, w_idx, h_idx, n_idx, resudial;
+
+  T value;
+  for (int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_elements; idx += idx_jump) {
+        
+    c_idx = idx%c_out;
+    if (c_idx >= c_in){
+      value = zero;    
+    }
+    else{
+      resudial = idx/c_out;
+      w_idx = resudial%w;
+      resudial = resudial/w;
+      h_idx = resudial%h;
+      n_idx = resudial/h;	
+      resudial = ((n_idx * h + h_idx) * w + w_idx) * c_in + c_idx;
+      value = input[resudial];
+    }
+    output[idx] = value;
+  }
+}
+
+
+// fast kernel for c_in = 3 & c_out = 4
+template <typename Tio, typename Telement, int element_in_Tio>
+__global__ void nhwc_padding_channel_3To4_kernel(const int32_t n,
+                                                 const int32_t h,
+                                                 const int32_t w,
+                                                 const Tio *input,
+                                                 Tio *output,
+                                                 const int32_t max_output_element,
+                                                 const int32_t max_input_element,
+                                                 const Tio zero_io,
+                                                 const Telement zero_element){                                                
+  __shared__ Tio shm[192];
+  const int tidx = blockIdx.x * 192 + threadIdx.x;  
+  const int threadidx = threadIdx.x; 
+
+  shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx];  
+  __syncthreads();
+  
+  const int ouput_offset = blockIdx.x * 256;
+  const int lower_bound = max_output_element < ouput_offset + 256 ? max_output_element : ouput_offset + 256;
+  for (int i = ouput_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192)
+  {
+    const Telement* shm_element = (const Telement*)shm + j*3*element_in_Tio/4;
+    Telement array[element_in_Tio];
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0 ; k < element_in_Tio ; k++)
+      array[k] = ((k+1)%4 == 0) ? zero_element : shm_element[(k > 3) ? (k - 1) : k];
+    output[i] = *((const Tio *)array);
+  }
+}
+
+// fast kernel for c_in = 3 & c_out = 8
+template <typename Tio, typename Telement, int element_in_Tio>
+__global__ void nhwc_padding_channel_3To8_kernel(const int32_t n,
+                                                 const int32_t h,
+                                                 const int32_t w,
+                                                 const Tio *input,
+                                                 Tio *output,
+                                                 const int32_t max_output_element,
+                                                 const int32_t max_input_element,
+                                                 const Tio zero_io,
+                                                 const Telement zero_element){                                                
+  __shared__ Tio shm[192];
+  const int tidx = blockIdx.x * 192 + threadIdx.x;  
+  const int threadidx = threadIdx.x; 
+
+  shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx];  
+  __syncthreads();
+  
+  const int ouput_offset = blockIdx.x * 512;
+  const int lower_bound = max_output_element < ouput_offset + 512 ? max_output_element : ouput_offset + 512;
+  for (int i = ouput_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192)
+  {
+    const Telement* shm_element = (const Telement*)shm + (element_in_Tio == 4 ? j/2 : j)*3;
+    Telement array[element_in_Tio];
+    //float
+    if (element_in_Tio == 4){
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0 ; k < element_in_Tio ; k++)
+        array[k] = ((j % 2) == 1) ? zero_element : ((k >= 3) ? zero_element : shm_element[k]);
+    }
+    //half
+    else{
+      CUTLASS_PRAGMA_UNROLL
+      for (int k = 0 ; k < element_in_Tio ; k++) 
+        array[k] = (k >= 3) ? zero_element : shm_element[k];          
+    }
+    output[i] = *((const Tio *)array);
+  }
+}
+
+template <typename T>
+void nhwc_padding(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNHWC> ref_input,
+                  TensorRef<T, layout::TensorNHWC> ref_output,
+                  cudaStream_t stream){
+  assert(
+    input_tensor_size.n() == output_tensor_size.n() &&
+    input_tensor_size.h() == output_tensor_size.h() &&
+    input_tensor_size.w() == output_tensor_size.w() &&
+    input_tensor_size.c() <= output_tensor_size.c()); 
+    
+  int n = input_tensor_size.n();
+  int h = input_tensor_size.h();
+  int w = input_tensor_size.w();
+  int c_in = input_tensor_size.c();
+  int c_out = output_tensor_size.c();
+    
+  //case 1 : channel == 3 padding to 4 or 8
+  if ((c_out == 4 || c_out == 8) && c_in == 3 && (n*h*w % 8 == 0)){
+    dim3 block(192);
+    const int nhw = n*h*w;
+    const int nhwc = nhw*c_in;
+    //for half_t
+    if (cutlass::sizeof_bits<T>::value == 16){
+      const int element_in_Tio = 8;
+      const int max_input_element = nhwc/element_in_Tio;
+      const int max_output_element = nhw*c_out/element_in_Tio;
+      const int4 zero_io = {0, 0, 0, 0};
+      const half_t zero_element = static_cast<half_t>(0.0f);
+      dim3 grid((nhwc + 192*element_in_Tio - 1)/(192*element_in_Tio));
+      if (c_out == 4){
+        nhwc_padding_channel_3To4_kernel<int4, half_t, element_in_Tio><<<grid, block, 0, stream>>>
+          (n, h, w,
+          (const int4 *)ref_input.data(),
+          (int4 *)ref_output.data(),
+          max_output_element,
+          max_input_element,
+          zero_io,
+          zero_element);
+      }
+      else if (c_out == 8){
+        nhwc_padding_channel_3To8_kernel<int4, half_t, element_in_Tio><<<grid, block, 0, stream>>>
+          (n, h, w,
+          (const int4 *)ref_input.data(),
+          (int4 *)ref_output.data(),
+          max_output_element,
+          max_input_element,
+          zero_io,
+          zero_element);
+      }
+    }
+    //for float
+    else{
+      const int element_in_Tio = 4;
+      const int max_input_element = nhwc/element_in_Tio;
+      const int max_output_element = nhw*c_out/element_in_Tio;
+      const float4 zero_io = {0.0f, 0.0f, 0.0f, 0.0f};
+      const float zero_element = 0.0f;
+      dim3 grid((nhwc + 192*element_in_Tio - 1)/(192*element_in_Tio));
+      if (c_out == 4){
+        nhwc_padding_channel_3To4_kernel<float4, float, element_in_Tio><<<grid, block, 0, stream>>>
+          (n, h, w,
+          (const float4 *)ref_input.data(),
+          (float4 *)ref_output.data(),
+          max_output_element,
+          max_input_element,
+          zero_io,
+          zero_element);
+      }
+      else if (c_out == 8){
+        nhwc_padding_channel_3To8_kernel<float4, float, element_in_Tio><<<grid, block, 0, stream>>>
+          (n, h, w,
+          (const float4 *)ref_input.data(),
+          (float4 *)ref_output.data(),
+          max_output_element,
+          max_input_element,
+          zero_io,
+          zero_element);
+      }
+    }
+  }
+  //case 2 : even channel
+  else if ((c_out % 2) == 0 && (c_in % 2) == 0){
+    int32_t total_elements = n * h * w * c_out / 2;
+    int block_size = 256;
+    dim3 grid((total_elements + 255)/256);
+    dim3 block(block_size);
+    //for half_t
+    if (cutlass::sizeof_bits<T>::value == 16){
+      const __half2 zero  = {0.0f, 0.0f};
+      nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in/2, c_out/2, zero, (const __half2*)ref_input.data(), (__half2*)ref_output.data());
+    }
+    //for float
+    else{
+      const float2 zero  = {0.0f, 0.0f};
+      nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in/2, c_out/2, zero, (const float2*)ref_input.data(), (float2*)ref_output.data());
+    }
+  }
+  //case 3 : odd channel
+  else{
+    int32_t total_elements = n * h * w * c_out;
+    int block_size = 256;
+    dim3 grid((total_elements + 255)/256);
+    dim3 block(block_size);
+    const T zero = static_cast<T>(0.0f);
+    nhwc_padding_kernel<<<grid, block, 0, stream>>>(n, h, w, c_in, c_out, zero, ref_input.data(), ref_output.data());
+  }
+}
+
+
+} //namespace cutlass
diff --git a/tools/util/include/cutlass/util/device_nhwc_to_nchw.h b/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
new file mode 100644
index 00000000..1ec3c72c
--- /dev/null
+++ b/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
@@ -0,0 +1,144 @@
+/******************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * \brief cuda kernels to transform a device memory tensor from NHWC layout to NCHW layout.
+ */
+
+#include "cutlass/cutlass.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/tensor_ref.h"
+
+namespace cutlass {
+
+/** \brief interface to transform a device memory tensor from NHWC layout to NCHW layout.
+ * \tparam T: data type
+ */
+template <typename T>
+void nhwc_to_nchw(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNHWC> ref_input,
+                  TensorRef<T, layout::TensorNCHW> ref_output,
+                  cudaStream_t stream);
+
+
+template <typename T>
+__global__ void nhwc_to_nchw_kernel(T *output, 
+                                    const T *input, 
+                                    const int n,
+                                    const int h, 
+                                    const int w, 
+                                    const int c) {
+ 
+  const int hw = h*w;
+  const int hwc = hw*c;
+  __shared__ T shbuf[32 * (32 + 1)]; 
+  const int32_t tid  = threadIdx.y*blockDim.x + threadIdx.x;
+  const int32_t wid  = tid / 32; 
+  const int32_t lid  = tid % 32; 
+  const int32_t ni   = blockIdx.z;
+  const int32_t hwi0  = blockIdx.y * 32;  
+  const int32_t ci0 = blockIdx.x * 32;  
+
+  const size_t input_idx = ni * hwc + (hwi0 + wid) * c + ci0;
+  const T *A = input + input_idx;
+  if (ci0 + lid < c) {
+    const int lid_x_33 = lid * 33;
+    if ((hwi0 + 32) <= hw) {
+      int hwi = wid;  // between 0 and 7
+      CUTLASS_PRAGMA_UNROLL
+      for (int cLoopIdx = 0; cLoopIdx < 4; cLoopIdx++) { 
+        shbuf[lid_x_33 + hwi] = A[lid];
+        A                     = &A[8 * c];
+        hwi += 8;
+      }
+    } else {
+      for (int hwi = wid; hwi < 32; hwi += 8) { 
+        if ((hwi + hwi0) < hw) {
+          shbuf[lid_x_33 + hwi] = A[lid];
+        }
+        A = &A[8 * c];
+      }
+    }
+  }
+  __syncthreads();
+
+  const int32_t hwiOut = hwi0 + lid;
+  output = &output[ni * hwc + hwiOut];
+  if (hwiOut < hw) {
+    if (ci0 + 32 < c) {
+      int cI = wid;
+      CUTLASS_PRAGMA_UNROLL
+      for (int hwLoopIdx = 0; hwLoopIdx < 4; ++hwLoopIdx) {
+        output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
+        cI += 8;
+      }
+    } else {
+      for (int cI = wid; cI < 32; cI += 8) {
+        if (ci0 + cI < c) {
+          output[(ci0 + cI) * hw] = shbuf[(cI)*33 + lid];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void nhwc_to_nchw(cutlass::Tensor4DCoord input_tensor_size,
+                  cutlass::Tensor4DCoord output_tensor_size,
+                  TensorRef<T, layout::TensorNHWC> ref_input,
+                  TensorRef<T, layout::TensorNCHW> ref_output,
+                  cudaStream_t stream) {
+  
+  assert(
+    input_tensor_size.n() == output_tensor_size.n() &&
+    input_tensor_size.h() == output_tensor_size.c() &&
+    input_tensor_size.w() == output_tensor_size.h() &&
+    input_tensor_size.c() == output_tensor_size.w());
+
+  int n = input_tensor_size.n();
+  int h = input_tensor_size.h();
+  int w = input_tensor_size.w();
+  int c = input_tensor_size.c();
+
+  dim3 grid((c + 31)/32, (h*w + 31)/32, n);
+  dim3 block(32, 8);
+  nhwc_to_nchw_kernel<<<grid, block, 0, stream>>>(ref_output.data(), ref_input.data(), 
+                                                  n, h, w, c);
+
+}
+
+} //namespace cutlass
diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h
index b135ee05..4ebf58bb 100644
--- a/tools/util/include/cutlass/util/distribution.h
+++ b/tools/util/include/cutlass/util/distribution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h
index d8d6ef94..bbdf5e0e 100644
--- a/tools/util/include/cutlass/util/exceptions.h
+++ b/tools/util/include/cutlass/util/exceptions.h
@@ -1,19 +1,31 @@
 /******************************************************************************
- * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
- * modification, are not permitted.
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
 
diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h
index bdb7f2e4..4f724315 100644
--- a/tools/util/include/cutlass/util/host_reorder.h
+++ b/tools/util/include/cutlass/util/host_reorder.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h
index cc5c890a..b6a7d0a6 100644
--- a/tools/util/include/cutlass/util/host_tensor.h
+++ b/tools/util/include/cutlass/util/host_tensor.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
index 15066a07..54e65b46 100644
--- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h
+++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/host_uncompress.h b/tools/util/include/cutlass/util/host_uncompress.h
index 3af9a47f..7fbfc3dd 100644
--- a/tools/util/include/cutlass/util/host_uncompress.h
+++ b/tools/util/include/cutlass/util/host_uncompress.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/index_sequence.h b/tools/util/include/cutlass/util/index_sequence.h
index ad80dc32..600ad884 100644
--- a/tools/util/include/cutlass/util/index_sequence.h
+++ b/tools/util/include/cutlass/util/index_sequence.h
@@ -1,52 +1,38 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
 
 #pragma once
 
-#include <utility>
 #include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
 
-/**
- * \file
- * \brief C++11 version of index_sequence.
- */
-
-namespace cutlass {
-
-template <size_t... Seq>
-struct index_sequence;
-
-template <size_t N, size_t... Next>
-struct index_sequence_helper : index_sequence_helper<N - 1, N - 1, Next...> {};
-
-template <size_t... Next>
-struct index_sequence_helper<0, 0, Next...> {
-  using type = index_sequence<0, Next...>;
-};
-
-template <size_t N>
-using make_index_sequence = typename index_sequence_helper<N>::type;
+// integer_sequence moved to cutlass/numeric_types.h
 
-}  // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h
index 2188c46b..3735940b 100644
--- a/tools/util/include/cutlass/util/reference/detail/inner_product.h
+++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
index f5b33f56..aae510f8 100644
--- a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
+++ b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h
index f6e04c1c..6f6ede63 100644
--- a/tools/util/include/cutlass/util/reference/device/convolution.h
+++ b/tools/util/include/cutlass/util/reference/device/convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h
index 02917ffc..44488083 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index 80143bb9..39bb7aa0 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -227,36 +233,80 @@ void GemmComplex(
     batch_count % std::numeric_limits<uint16_t>::max()
   );
 
-  kernel::GemmComplex<
-    ElementA,
-    LayoutA,
-    ElementB,
-    LayoutB,
-    ElementC,
-    LayoutC,
-    ScalarType,
-    ComputeType,
-    ConvertOp,
-    InnerProductOp,
-    kMblock,
-    kNblock
-  ><<< grid, block >>>(
-    problem_size,
-    alpha,
-    tensor_a,
-    transform_a,
-    tensor_b,
-    transform_b,
-    beta,
-    tensor_c,
-    tensor_d,
-    initial_accum,
-    batch_count,
-    batch_stride_A,
-    batch_stride_B,
-    batch_stride_C,
-    batch_stride_D
-  );
+  if (grid.y <= std::numeric_limits<uint16_t>::max()) {
+    kernel::GemmComplex<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ScalarType,
+      ComputeType,
+      ConvertOp,
+      InnerProductOp,
+      kMblock,
+      kNblock
+    ><<< grid, block >>>(
+      problem_size,
+      alpha,
+      tensor_a,
+      transform_a,
+      tensor_b,
+      transform_b,
+      beta,
+      tensor_c,
+      tensor_d,
+      initial_accum,
+      batch_count,
+      batch_stride_A,
+      batch_stride_B,
+      batch_stride_C,
+      batch_stride_D
+    );
+  } else {
+    // Using bigger thread tile size
+    int const kBigMblock = 4;
+    int const kBigNblock = 16;
+
+    dim3 Bigblock(16, 8);
+    dim3 Biggrid(
+      (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock),
+      (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock),
+      batch_count % std::numeric_limits<uint16_t>::max()
+    );
+
+    kernel::GemmComplex<
+      ElementA,
+      LayoutA,
+      ElementB,
+      LayoutB,
+      ElementC,
+      LayoutC,
+      ScalarType,
+      ComputeType,
+      ConvertOp,
+      InnerProductOp,
+      kBigMblock,
+      kBigNblock
+    ><<< Biggrid, Bigblock >>>(
+      problem_size,
+      alpha,
+      tensor_a,
+      transform_a,
+      tensor_b,
+      transform_b,
+      beta,
+      tensor_c,
+      tensor_d,
+      initial_accum,
+      batch_count,
+      batch_stride_A,
+      batch_stride_B,
+      batch_stride_C,
+      batch_stride_D
+    );
+  }
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
index 9ac24add..7a387eaa 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
index beca9e01..524fc214 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
index 5860d1f9..80a689fb 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
index ce6f9eba..e9608253 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h b/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
new file mode 100644
index 00000000..0e159ee9
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
@@ -0,0 +1,355 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for complex-valued GEMM in device-side code.
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+
+namespace cutlass {
+namespace reference {
+namespace device {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace kernel {
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  int kMblock = 4,
+  int kNblock = 4
+>
+__global__ void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ComplexTransform transform_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum,
+  FillMode fill_mode_c,
+  BlasMode blas_mode,
+  int batch_count = 1,
+  int64_t batch_stride_A = 0,
+  int64_t batch_stride_B = 0,
+  int64_t batch_stride_C = 0,
+  int64_t batch_stride_D = 0) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  int const K = problem_size.k();
+
+  assert(M=N);
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  
+  int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock;
+  int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; 
+  int batch_idx = blockIdx.z;
+
+  tensor_a.add_pointer_offset(batch_idx * batch_stride_A);
+  tensor_b.add_pointer_offset(batch_idx * batch_stride_B);
+  tensor_c.add_pointer_offset(batch_idx * batch_stride_C);
+  tensor_d.add_pointer_offset(batch_idx * batch_stride_D);
+
+  for (; batch_idx < batch_count; batch_idx += gridDim.z) {
+
+    // Compute matrix product using blocks
+    ComputeType accum[kMblock][kNblock];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < kNblock; j++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kMblock; i++) {
+        accum[i][j] = initial_accum;
+      }
+    }
+
+    for (int k_block = 0; k_block < K; ++k_block) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int j = 0; j < kNblock; j++) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < kMblock; i++) {
+          int row = row_block + i;
+          int col = col_block + j;
+
+          if (row < M && col < N &&
+             ( (fill_mode_c == FillMode::kLower && row >= col) || 
+              (fill_mode_c == FillMode::kUpper && row <= col) )               
+            ) {
+
+            // A x B^T (Symmetric) or A x B^H (Hermitian)
+            // complex conjugation on operandB (b_t) is function of blas3 computation
+            ElementA a = tensor_a.at(MatrixCoord(row, k_block));
+            ElementB b_t = (blas_mode == BlasMode::kHermitian) ? 
+                          conj(tensor_b.at(MatrixCoord(col, k_block))) : 
+                          tensor_b.at(MatrixCoord(col, k_block));
+
+            ComputeType a_ik = ComputeType(a);
+            ComputeType b_jk = ComputeType(b_t);
+
+            // complex conjugation is a function of operand layouts
+            if (transform_a == ComplexTransform::kConjugate) {
+              a_ik = conj(a_ik);
+            }
+            // complex conjugation is a function of operand layouts
+            if (transform_b == ComplexTransform::kConjugate) {
+              b_jk = conj(b_jk);
+            }
+
+            accum[i][j] = inner_product_op(a_ik, b_jk,  accum[i][j]);
+
+            // B x A^T (Symmetric) or B x A^H (Hermitian)
+            // complex conjugation on operandB (a_t) is function of blas3 computation
+            ElementB b = tensor_b.at(MatrixCoord(row, k_block));
+            ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
+                            conj(tensor_a.at(MatrixCoord(col, k_block))):
+                            tensor_a.at(MatrixCoord(col, k_block));
+
+            ComputeType b_ik = ComputeType(b);
+            ComputeType a_jk = ComputeType(a_t);
+            
+            // complex conjugation here is a function of operand layouts
+            if (transform_b == ComplexTransform::kConjugate) {
+              b_ik = conj(b_ik);
+            }
+            // complex conjugation here is a function of operand layouts
+            if (transform_a == ComplexTransform::kConjugate) {
+              a_jk = conj(a_jk);
+            }
+
+            accum[i][j] = inner_product_op(a_ik, b_kj,  accum[i][j]);
+          }
+        }
+      }
+    }
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int j = 0; j < kNblock; j++) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < kMblock; i++) {
+        int row = row_block + i;
+        int col = col_block + j;
+
+        MatrixCoord coord = MatrixCoord(row, col);
+
+        if (row < M && col < N && 
+            ((fill_mode_c == FillMode::kLower && row >= col) || 
+             (fill_mode_c == FillMode::kUpper && row <= col))
+          ) {
+
+          ScalarType c = tensor_c.at(coord);
+          // The imaginary parts of the diagonal elements of 
+          // a complex data type are assumed and set to zero
+          if (blas_mode == BlasMode::kHermitian) {
+            c = (row == col) ? real(c) : c;
+          }
+
+          tensor_d.at(coord) = convert_op(
+            alpha * ScalarType(accum[i][j]) + 
+            beta * c);
+        }
+      }
+    }
+
+    tensor_a.add_pointer_offset(batch_stride_A * gridDim.z);
+    tensor_b.add_pointer_offset(batch_stride_B * gridDim.z);
+    tensor_c.add_pointer_offset(batch_stride_C * gridDim.z);
+    tensor_d.add_pointer_offset(batch_stride_D * gridDim.z);
+
+  } // for (batch_idx)
+}
+
+} // namespace kernel
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
+  typename InnerProductOp = multiply_add<ComputeType>
+>
+void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ComplexTransform transform_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum,
+  FillMode fill_mode_c,
+  BlasMode blas_mode,
+  int batch_count = 1,
+  int64_t batch_stride_A = 0,
+  int64_t batch_stride_B = 0,
+  int64_t batch_stride_C = 0,
+  int64_t batch_stride_D = 0) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+ 
+  int const kMblock = 4;
+  int const kNblock = 4;
+
+  dim3 block(16, 8);
+  dim3 grid(
+    (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock),
+    (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock),
+    batch_count % std::numeric_limits<uint16_t>::max()
+  );
+
+  kernel::Rank2KComplex<
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC,
+    ScalarType,
+    ComputeType,
+    ConvertOp,
+    InnerProductOp,
+    kMblock,
+    kNblock
+  ><<< grid, block >>>(
+    problem_size,
+    alpha,
+    tensor_a,
+    transform_a,
+    tensor_b,
+    transform_b,
+    beta,
+    tensor_c,
+    tensor_d,
+    initial_accum,
+    fill_mode_c,
+    blas_mode,
+    batch_count,
+    batch_stride_A,
+    batch_stride_B,
+    batch_stride_C,
+    batch_stride_D
+  );
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// This assumes the accumulator type is the same type as the scalars.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType
+>
+void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ComplexTransform transform_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  FillMode fill_mode_c,
+  BlasMode blas_mode) {
+
+  Rank2KComplex(    
+    problem_size, alpha, 
+    tensor_a, transform_a, 
+    tensor_b, transform_b, 
+    beta, tensor_c, tensor_d, 
+    ScalarType(0),
+    fill_mode_c,
+    blas_mode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace device
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
index a28b95bf..4db1efa2 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h
index 6612d8e3..12480c42 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -50,6 +56,8 @@
 #include "cutlass/array.h"
 #include "cutlass/complex.h"
 #include "cutlass/tensor_view.h"
+#include "cutlass/blas3.h"
+
 #include "cutlass/util/reference/device/tensor_foreach.h"
 #include "cutlass/util/distribution.h"
 
@@ -1006,6 +1014,224 @@ struct TensorFillDiagonalFunc {
   }
 };
 
+// Overwrites the elements of a tensor with a uniform value depending on fill mode
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorFillPartialFunc {
+
+  /// View type
+  using TensorView = TensorView<Element, Layout>;
+
+  /// Scalar type
+  typedef typename TensorView::Element T;
+
+  /// Coordinate in tensor's index space
+  typedef typename TensorView::TensorCoord TensorCoord;
+
+  /// Parameters structure
+  struct Params {
+
+    //
+    // Data members
+    //
+
+    TensorView view;
+    Element element;
+    FillMode fill_mode;
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): fill_mode(FillMode::kNone) { }
+
+    //
+    // Methods
+    //
+
+    /// Construction of Gaussian RNG functor.
+    Params(
+      TensorView view_,
+      Element element_,
+      FillMode fill_mode_
+    ):
+      view(view_), element(element_), fill_mode(fill_mode_) {
+
+    }
+  };
+
+  //
+  // Data members
+  //
+
+  /// Parameters object
+  Params params;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  TensorFillPartialFunc(Params const &params): params(params) {
+
+  }
+
+  /// Overwrites the element if it is within the covered region.
+  CUTLASS_DEVICE
+  void operator()(TensorCoord const &coord) {
+
+    bool predicate = true;
+      
+    switch (params.fill_mode) {
+    case FillMode::kFull:
+      predicate = true;
+      break;
+
+    case FillMode::kLower:
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < Layout::kRank; ++i) {
+        if (coord[i - 1] < coord[i]) {
+          predicate = false;
+          break;
+        }
+      }
+      break;
+
+    case FillMode::kUpper:
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < Layout::kRank; ++i) {
+        if (coord[i - 1] > coord[i]) {
+          predicate = false;
+          break;
+        }
+      }
+      break;
+
+    case FillMode::kDiagonal:
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < Layout::kRank; ++i) {
+        if (coord[i - 1] != coord[i]) {
+          predicate = false;
+          break;
+        }
+      }
+      break;
+
+    case FillMode::kNone: // fall-through
+    
+    default:
+      predicate = false;
+      break;
+    }
+    
+    if (predicate) {
+      params.view.at(coord) = params.element;
+    }
+  }
+};
+
+
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorClearPartialFunc {
+
+  /// View type
+  using TensorView = TensorView<Element, Layout>;
+
+  /// Scalar type
+  typedef typename TensorView::Element T;
+
+  /// Coordinate in tensor's index space
+  typedef typename TensorView::TensorCoord TensorCoord;
+
+  /// 
+  static_assert((Layout::kRank == 2), "TensorClearPartial is only supported for matrices");
+
+  /// Parameters structure
+  struct Params {
+
+    //
+    // Data members
+    //
+
+    TensorView view;
+    Element element;
+    FillMode fill_mode;
+    int alignment;
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Params(): fill_mode(FillMode::kNone) { }
+
+    //
+    // Methods
+    //
+
+    /// Construction of Gaussian RNG functor.
+    Params(
+      TensorView view_,
+      Element element_,
+      FillMode fill_mode_,
+      int alignment_
+    ):
+      view(view_), element(element_), fill_mode(fill_mode_), alignment(alignment_) {
+
+    }
+  };
+
+  //
+  // Data members
+  //
+
+  /// Parameters object
+  Params params;
+
+  //
+  // Methods
+  //
+
+  CUTLASS_DEVICE
+  TensorClearPartialFunc(Params const &params): params(params) {
+
+  }
+
+  /// Overwrites the element if it is within the covered region.
+  CUTLASS_DEVICE
+  void operator()(TensorCoord const &coord) {
+
+    bool predicate = true;
+      
+    switch (params.fill_mode) {
+
+    case FillMode::kLower:
+      if ((coord[0] >= coord[1]) || 
+          ((coord[1] - coord[0]) >= params.alignment))  {
+          predicate = false;
+        break;
+      }
+      break;
+
+    case FillMode::kUpper:
+      if ((coord[0] <= coord[1]) ||
+          ((coord[0] - coord[1]) >= params.alignment))  {
+          predicate = false;
+        break;
+      }
+      break;
+
+    case FillMode::kNone: // fall-through
+    
+    default:
+      predicate = false;
+      break;
+    }
+    
+    if (predicate) {
+      params.view.at(coord) = params.element;
+    }
+  }
+};
+
 } // namespace detail
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1028,6 +1254,45 @@ void TensorFillDiagonal(
   );
 }
 
+/// Fills a tensor partially depending on fill mode. Elements not covered by the fillmode are
+/// not written.
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+void TensorFillPartial(
+  TensorView<Element, Layout> view,       ///< destination tensor
+  Element element,
+  FillMode fill_mode) {
+  
+  typedef detail::TensorFillPartialFunc<Element, Layout> Func;
+  typedef typename Func::Params Params;
+
+  TensorForEach<Func, Layout::kRank, Params>(
+    view.extent(),
+    Params(view, element, fill_mode)
+  );
+}
+
+/// Clears a tensor partially depending on fill mode and alignment. Elements on the wrong-side 
+/// of fillmode (upto the alignment) are overwritten with the user supplied element (typically zeros)
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+void TensorClearPartial(
+  TensorView<Element, Layout> view,       ///< destination tensor
+  Element element,
+  FillMode fill_mode,
+  int alignment) {
+  
+  typedef detail::TensorClearPartialFunc<Element, Layout> Func;
+  typedef typename Func::Params Params;
+
+  TensorForEach<Func, Layout::kRank, Params>(
+    view.extent(),
+    Params(view, element, fill_mode, alignment)
+  );
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Fills a tensor with a uniform value
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
index 73566ae0..126d5b00 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
index 79422e34..51a57221 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h
index eb8c44d0..d96f1e25 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_relu.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h
index 06855cf4..10c6a56c 100644
--- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h
index 94e887fc..0e395527 100644
--- a/tools/util/include/cutlass/util/reference/host/convolution.h
+++ b/tools/util/include/cutlass/util/reference/host/convolution.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/error_metrics.h b/tools/util/include/cutlass/util/reference/host/error_metrics.h
index 5f26fe59..e4c6acd6 100644
--- a/tools/util/include/cutlass/util/reference/host/error_metrics.h
+++ b/tools/util/include/cutlass/util/reference/host/error_metrics.h
@@ -1,25 +1,31 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h
index 4d7ec310..fc922254 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h
index 567805c3..37221b09 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
index a5eda2ed..c1e5c094 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/rank_2k.h b/tools/util/include/cutlass/util/reference/host/rank_2k.h
new file mode 100644
index 00000000..5abcc432
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/rank_2k.h
@@ -0,0 +1,261 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for Rank 2k update in host-side code.
+    
+    
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  FillMode FillModeC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_rank2k(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, 
+    "Tensors must be of rank 2");
+
+  static_assert(
+    FillModeC == FillMode::kLower || 
+    FillModeC == FillMode::kUpper, 
+    "Fill Mode can either be Lower or Upper.");
+
+  using CompareOp = typename platform::conditional<(FillModeC == FillMode::kLower), 
+                                                    std::greater_equal<int>, 
+                                                    std::less_equal<int>>::type;
+
+  // Note: batch is ignored.
+  // Note: M is same as N for Rank 2k update
+  int const N = problem_size.n();
+  int const K = problem_size.k();
+
+  // Blocking necessary to speedup reference implementation
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  CompareOp compare_op;
+
+  for (int row_block = 0; row_block < N; row_block += Nblock) {
+    for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+      ComputeType accum[Nblock][Nblock];
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Nblock; i++) {
+          accum[i][j] = initial_accum;
+        }
+      }
+
+      for (int k_block = 0; k_block < K; ++k_block) {
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Nblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            if (row < N && col < N && compare_op(row, col)) 
+            {
+
+              // A x B^T
+              ElementA a = tensor_a.at(MatrixCoord(row, k_block));
+              ElementB b_t = tensor_b.at(MatrixCoord(col, k_block));
+
+              ComputeType compute_a(cast_if_scalar<ComputeType>(a));
+              ComputeType compute_b_t(cast_if_scalar<ComputeType>(b_t));
+
+              accum[i][j] = inner_product_op(compute_a, compute_b_t, accum[i][j]);
+
+              // B x A^T
+              ElementB b = tensor_b.at(MatrixCoord(row, k_block));
+              ElementA a_t = tensor_a.at(MatrixCoord(col, k_block));
+
+              ComputeType compute_b(cast_if_scalar<ComputeType>(b));
+              ComputeType compute_a_t(cast_if_scalar<ComputeType>(a_t));
+
+              accum[i][j] = inner_product_op(compute_b, compute_a_t, accum[i][j]);
+            }
+          }
+        }
+      }
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Nblock; i++) {
+          int row = row_block + i;
+          int col = col_block + j;
+
+          MatrixCoord coord = MatrixCoord(row, col);
+
+          if (row < N && col < N && 
+              ( (FillModeC == FillMode::kLower && row >= col) || 
+                (FillModeC == FillMode::kUpper && row <= col) )
+          ) {
+            tensor_d.at(coord) = convert_op(
+              alpha * ScalarType(accum[i][j]) +
+              beta * ScalarType(tensor_c.at(coord)));
+          }
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general Rank 2k update (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  FillMode FillModeC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_rank2k(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  ComputeType initial_accum) {
+  compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
+               ScalarType, ComputeType, InnerProductOp, ConvertOp>(
+      problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
+      initial_accum);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  FillMode FillModeC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = cutlass::arch::OpMultiplyAdd
+>
+struct Rank2K;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiply-add
+template <typename ElementA, typename LayoutA, 
+          typename ElementB, typename LayoutB, 
+          typename ElementC, typename LayoutC, FillMode FillModeC,
+          typename ScalarType, typename ComputeType>
+struct Rank2K<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC, ScalarType,
+            ComputeType, arch::OpMultiplyAdd> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
+  }
+  
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_rank2k<ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, FillModeC,
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
new file mode 100644
index 00000000..09499111
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
@@ -0,0 +1,318 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for complex-valued Rank 2K update in host-side code.
+
+    
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include <assert.h>
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
+  typename InnerProductOp = multiply_add<ComputeType>
+>
+void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ComplexTransform transform_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum,
+  FillMode fill_mode_c,
+  BlasMode blas_mode,
+  int batch_count = 1,
+  int64_t batch_stride_A = 0,
+  int64_t batch_stride_B = 0,
+  int64_t batch_stride_C = 0,
+  int64_t batch_stride_D = 0) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  int const K = problem_size.k();
+
+  // Rank2K update operates on A=NxK, B=NxK, and C=NxN
+  assert(M==N);
+
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+
+  for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
+
+    // Compute matrix product using blocks
+    for (int row_block = 0; row_block < M; row_block += Mblock) {
+      for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+        ComputeType accum[Mblock][Nblock];
+
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            accum[i][j] = initial_accum;
+          }
+        }
+
+        for (int k_block = 0; k_block < K; ++k_block) {
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              int row = row_block + i;
+              int col = col_block + j;
+
+              if (row < M && col < N &&
+                 ( (fill_mode_c == FillMode::kLower && row >= col) || 
+                  (fill_mode_c == FillMode::kUpper && row <= col) )               
+                ) {
+                
+                // A x B^T (Symmetric) or A x B^H (Hermitian)
+                // complex conjugation on operandB (b_t) is function of blas3 computation
+                ElementA a = tensor_a.at(MatrixCoord(row, k_block));
+                ElementB b_t = (blas_mode == BlasMode::kHermitian) ? 
+                              conj(tensor_b.at(MatrixCoord(col, k_block))) : 
+                              tensor_b.at(MatrixCoord(col, k_block));
+
+                ComputeType a_ik = ComputeType(a);
+                ComputeType b_jk = ComputeType(b_t);
+
+                // complex conjugation is a function of operand layouts
+                if (transform_a == ComplexTransform::kConjugate) {
+                  a_ik = conj(a_ik);
+                }
+                // complex conjugation is a function of operand layouts
+                if (transform_b == ComplexTransform::kConjugate) {
+                  b_jk = conj(b_jk);
+                }
+
+                accum[i][j] = inner_product_op(a_ik, b_jk,  accum[i][j]);
+              }
+            }
+          }
+        }
+
+        /* HER2K need two epilogues to handle complex alpha value */
+        if ( blas_mode == BlasMode::kHermitian ) {
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              int row = row_block + i;
+              int col = col_block + j;
+
+              MatrixCoord coord = MatrixCoord(row, col);
+
+              if (row < M && col < N && 
+                  ((fill_mode_c == FillMode::kLower && row >= col) || 
+                  (fill_mode_c == FillMode::kUpper && row <= col))
+                ) {
+
+                ScalarType c = tensor_c.at(coord);
+                // The imaginary parts of the diagonal elements of 
+                // a complex data type are assumed and set to zero
+                if (blas_mode == BlasMode::kHermitian) {
+                  c = (row == col) ? real(c) : c;
+                }
+
+                tensor_d.at(coord) = convert_op(alpha * 
+                  ScalarType(accum[i][j]) + 
+                  beta * c);
+              }
+            }
+          }
+          
+          /* Zeoring out accum for second HERK */
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              accum[i][j] = initial_accum;
+            }
+          }
+        }
+
+        for (int k_block = 0; k_block < K; ++k_block) {
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              int row = row_block + i;
+              int col = col_block + j;
+
+              if (row < M && col < N &&
+                 ( (fill_mode_c == FillMode::kLower && row >= col) || 
+                  (fill_mode_c == FillMode::kUpper && row <= col) )               
+                ) {
+
+                // B x A^T (Symmetric) or B x A^H (Hermitian)
+                // complex conjugation on operandB (a_t) is function of blas3 computation
+                ElementB b = tensor_b.at(MatrixCoord(row, k_block));
+                ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
+                                conj(tensor_a.at(MatrixCoord(col, k_block))):
+                                tensor_a.at(MatrixCoord(col, k_block));
+
+                ComputeType b_ik = ComputeType(b);
+                ComputeType a_jk = ComputeType(a_t);
+                
+                // complex conjugation here is a function of operand layouts
+                if (transform_b == ComplexTransform::kConjugate) {
+                  b_ik = conj(b_ik);
+                }
+                // complex conjugation here is a function of operand layouts
+                if (transform_a == ComplexTransform::kConjugate) {
+                  a_jk = conj(a_jk);
+                }
+
+                accum[i][j] = inner_product_op(b_ik, a_jk, accum[i][j]);
+              }
+            }
+          }
+        }
+
+        ScalarType alpha_hermitian = (blas_mode == BlasMode::kHermitian) ? 
+                                      conj(alpha) : alpha;
+        ScalarType beta_hermitian = (blas_mode == BlasMode::kHermitian) ? 
+                                      1 : beta;
+        
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            MatrixCoord coord = MatrixCoord(row, col);
+
+            if (row < M && col < N && 
+                ((fill_mode_c == FillMode::kLower && row >= col) || 
+                 (fill_mode_c == FillMode::kUpper && row <= col))
+              ) {
+
+              ScalarType d = (blas_mode == BlasMode::kHermitian) ? 
+                             tensor_d.at(coord) : tensor_c.at(coord);
+
+              ScalarType tmp_d = convert_op(
+                alpha_hermitian * ScalarType(accum[i][j]) + 
+                beta_hermitian * d);
+
+              if (blas_mode == BlasMode::kHermitian && row == col ) {
+                tensor_d.at(coord) = real(tmp_d);
+              } else {
+                tensor_d.at(coord) = tmp_d;
+              }
+            }
+          }
+        }
+
+      } // for (col_block)
+    } // for (row_block)
+
+    tensor_a.add_pointer_offset(batch_stride_A);
+    tensor_b.add_pointer_offset(batch_stride_B);
+    tensor_c.add_pointer_offset(batch_stride_C);
+    tensor_d.add_pointer_offset(batch_stride_D);
+
+  } // for (batch_idx)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// This assumes the accumulator type is the same type as the scalars.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType
+>
+void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ComplexTransform transform_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  FillMode fill_mode_c,
+  BlasMode blas_mode) {
+
+  Rank2KComplex(
+    problem_size, alpha, 
+    tensor_a, transform_a, 
+    tensor_b, transform_b, 
+    beta, tensor_c, tensor_d, 
+    ScalarType(0),
+    fill_mode_c,
+    blas_mode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/rank_k_complex.h b/tools/util/include/cutlass/util/reference/host/rank_k_complex.h
new file mode 100644
index 00000000..7e6c39cc
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/rank_k_complex.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for complex-valued Rank 2K update in host-side code.
+
+    
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include <assert.h>
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>,
+  typename InnerProductOp = multiply_add<ComputeType>
+>
+void Rank2KComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum,
+  FillMode fill_mode_c,
+  BlasMode blas_mode,
+  int batch_count = 1,
+  int64_t batch_stride_A = 0,
+  int64_t batch_stride_C = 0,
+  int64_t batch_stride_D = 0) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  int const K = problem_size.k();
+
+  // Rank2K update operates on A=NxK, B=NxK, and C=NxN
+  assert(M==N);
+
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+
+  for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
+
+    // Compute matrix product using blocks
+    for (int row_block = 0; row_block < M; row_block += Mblock) {
+      for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+        ComputeType accum[Mblock][Nblock];
+
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            accum[i][j] = initial_accum;
+          }
+        }
+
+        for (int k_block = 0; k_block < K; ++k_block) {
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              int row = row_block + i;
+              int col = col_block + j;
+
+              if (row < M && col < N &&
+                 ( (fill_mode_c == FillMode::kLower && row >= col) || 
+                  (fill_mode_c == FillMode::kUpper && row <= col) )               
+                ) {
+                
+                // A x A^T (Symmetric) or A x A^H (Hermitian)
+                // complex conjugation on operandB (a_t) (function of blas3 computation)
+                ElementA a = tensor_a.at(MatrixCoord(row, k_block));
+                ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
+                              conj(tensor_a.at(MatrixCoord(col, k_block))) : 
+                              tensor_a.at(MatrixCoord(col, k_block));
+
+                ComputeType a_ik = ComputeType(a);
+                ComputeType b_jk = ComputeType(a_t);
+
+                // complex conjugation (function of input layouts)
+                if (transform_a == ComplexTransform::kConjugate) {
+                  a_ik = conj(a_ik);
+                }
+                // complex conjugation (function of input layouts)
+                if (transform_a == ComplexTransform::kConjugate) {
+                  b_jk = conj(b_jk);
+                }
+
+                accum[i][j] = inner_product_op(a_ik, b_jk,  accum[i][j]);
+
+              }
+            }
+          }
+        }
+
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            MatrixCoord coord = MatrixCoord(row, col);
+
+            if (row < M && col < N && 
+                ((fill_mode_c == FillMode::kLower && row >= col) || 
+                 (fill_mode_c == FillMode::kUpper && row <= col))
+              ) {
+
+              ScalarType c = tensor_c.at(coord);
+              // The imaginary parts of the diagonal elements of 
+              // a complex data type are assumed and set to zero
+              if (blas_mode == BlasMode::kHermitian) {
+                c = (row == col) ? real(c) : c;
+              }
+
+              ScalarType tmp_d = convert_op(
+                alpha * ScalarType(accum[i][j]) + 
+                beta * c);
+
+              if (blas_mode == BlasMode::kHermitian && row == col ) {
+                tensor_d.at(coord) = real(tmp_d);
+              } else {
+                tensor_d.at(coord) = tmp_d;
+              }
+            }
+          }
+        }
+
+      } // for (col_block)
+    } // for (row_block)
+
+    tensor_a.add_pointer_offset(batch_stride_A);
+    tensor_c.add_pointer_offset(batch_stride_C);
+    tensor_d.add_pointer_offset(batch_stride_D);
+
+  } // for (batch_idx)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// This assumes the accumulator type is the same type as the scalars.
+template <
+  typename ElementA,
+  typename LayoutA,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType
+>
+void RankKComplex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  ComplexTransform transform_a,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  FillMode fill_mode_c,
+  BlasMode blas_mode) {
+
+  Rank2KComplex(
+    problem_size, alpha, 
+    tensor_a, transform_a, 
+    beta, tensor_c, tensor_d, 
+    ScalarType(0),
+    fill_mode_c,
+    blas_mode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/symm.h b/tools/util/include/cutlass/util/reference/host/symm.h
new file mode 100644
index 00000000..d30e9a94
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/symm.h
@@ -0,0 +1,285 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for SYMM update in host-side code.
+    
+    
+
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/numeric_conversion.h"
+
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_symm(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, 
+    "Tensors must be of rank 2");
+
+  static_assert(SideModeA != SideMode::kInvalid
+                , "Side Mode can either be Left or Right.");
+
+  static_assert(
+    FillModeA == FillMode::kLower || 
+    FillModeA == FillMode::kUpper, 
+    "Fill Mode can either be Lower or Upper.");
+
+  using CompareOp_w_diag =  typename TrMatrixCompareOp<FillModeA, DiagType::kNonUnit>::Type;
+  using CompareOp_wo_diag = typename TrMatrixCompareOp<FillModeA, DiagType::kZero>::Type;
+
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  // Assuming correct k-dimension value is passed
+  int const K = problem_size.k();
+
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  CompareOp_w_diag compare_op_1;
+  CompareOp_wo_diag compare_op_2;
+
+  for (int row_block = 0; row_block < M; row_block += Mblock) {
+    for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+      ComputeType accum[Mblock][Nblock];
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          accum[i][j] = initial_accum;
+        }
+      }
+
+      for (int k_block = 0; k_block < K; ++k_block) {
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            if (row < M && col < N) {
+              ElementA a_1 = ElementA();
+              ElementB b_1 = ElementB();
+              ElementA a_2 = ElementA();
+              ElementB b_2 = ElementB();
+
+              // A x B or B x A (with diagonal)
+              if (SideModeA == SideMode::kLeft) {
+                a_1 = (compare_op_1(row, k_block)) ? 
+                      (tensor_a.at(MatrixCoord(row, k_block))) : ElementA();
+                b_1 = tensor_b.at(MatrixCoord(k_block, col));
+              } else if (SideModeA == SideMode::kRight) {
+                a_1 = tensor_b.at(MatrixCoord(row, k_block));
+                b_1 = (compare_op_1(k_block, col)) ? 
+                      tensor_a.at(MatrixCoord(k_block, col)) : ElementA();
+              }
+
+              ComputeType compute_a_1(cast_if_scalar<ComputeType>(a_1));
+              ComputeType compute_b_1(cast_if_scalar<ComputeType>(b_1));
+
+              accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]);
+
+              // A^T x B or B x A^T (without diagonal)
+              if (SideModeA == SideMode::kLeft) {
+                a_2 = (compare_op_2(k_block, row)) ? 
+                      (tensor_a.at(MatrixCoord(k_block, row))) : ElementA();
+                b_2 = tensor_b.at(MatrixCoord(k_block, col));
+              } else if (SideModeA == SideMode::kRight) {
+                a_2 = tensor_b.at(MatrixCoord(row, k_block));
+                b_2 = (compare_op_2(col, k_block)) ? 
+                      tensor_a.at(MatrixCoord(col, k_block)) : ElementA();
+              }
+
+              ComputeType compute_a_2(cast_if_scalar<ComputeType>(a_2));
+              ComputeType compute_b_2(cast_if_scalar<ComputeType>(b_2));
+
+              accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]);
+            }
+          }
+        }
+      }
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          int row = row_block + i;
+          int col = col_block + j;
+
+          MatrixCoord coord = MatrixCoord(row, col);
+
+          if (row < M && col < N) {
+            tensor_d.at(coord) = convert_op(
+              alpha * ScalarType(accum[i][j]) +
+              beta * ScalarType(tensor_c.at(coord)));
+          }
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general Symm update (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_symm(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  ComputeType initial_accum) {
+  compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
+               ScalarType, ComputeType, InnerProductOp, ConvertOp>(
+      problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c,
+      initial_accum);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = cutlass::arch::OpMultiplyAdd
+>
+struct Symm;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiply-add
+template <typename ElementA, typename LayoutA, 
+          SideMode SideModeA, FillMode FillModeA,
+          typename ElementB, typename LayoutB, 
+          typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType>
+struct Symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC, ScalarType,
+            ComputeType, arch::OpMultiplyAdd> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum);
+  }
+  
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_symm<ElementA, LayoutA, SideModeA, FillModeA, ElementB, LayoutB, ElementC, LayoutC,
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+        problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
+  }
+};
+
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/symm_complex.h b/tools/util/include/cutlass/util/reference/host/symm_complex.h
new file mode 100644
index 00000000..acb35aa1
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/symm_complex.h
@@ -0,0 +1,319 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for complex-valued SYMM update in host-side code.
+
+    
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include <assert.h>
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef
+/// objects.
+///
+/// Explicitly naming types needed by this template can be cumbersome, particularly for the
+/// accumulator type, so a function argument 'initial_accum' is exposed. Passing
+/// AccumulatorType(0) as the last function argument can be easier than naming all template
+/// arguments explicitly.
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  BlasMode BlasMode_ = BlasMode::kSymmetric,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_symm_complex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  ScalarType beta,
+  TensorRef<ElementC, LayoutC> tensor_c,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum,
+  int batch_count = 1,
+  int64_t batch_stride_A = 0,
+  int64_t batch_stride_B = 0,
+  int64_t batch_stride_C = 0,
+  int64_t batch_stride_D = 0) {
+  
+  static SideMode const kSideModeA = SideModeA;
+  static FillMode const kFillModeA = FillModeA;
+  static BlasMode const kBlasMode  = BlasMode_;
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutB::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  static_assert(kSideModeA != SideMode::kInvalid
+                , "Side Mode can either be Left or Right.");
+
+  static_assert(
+    kFillModeA == FillMode::kLower || 
+    kFillModeA == FillMode::kUpper, 
+    "Fill Mode can either be Lower or Upper.");
+
+  using CompareOp_w_diag =  typename TrMatrixCompareOp<kFillModeA, DiagType::kNonUnit>::Type;
+  using CompareOp_wo_diag = typename TrMatrixCompareOp<kFillModeA, DiagType::kZero>::Type;
+
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  // Assuming correct k-dimension value is passed
+  int const K = problem_size.k();
+
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  CompareOp_w_diag compare_op_1;
+  CompareOp_wo_diag compare_op_2;
+
+  for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) {
+
+    // Compute matrix product using blocks
+    for (int row_block = 0; row_block < M; row_block += Mblock) {
+      for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+        ComputeType accum[Mblock][Nblock];
+
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            accum[i][j] = initial_accum;
+          }
+        }
+
+        for (int k_block = 0; k_block < K; ++k_block) {
+          for (int j = 0; j < Nblock; j++) {
+            for (int i = 0; i < Mblock; i++) {
+              int row = row_block + i;
+              int col = col_block + j;
+
+              if (row < M && col < N) 
+              {
+                ElementA a_1 = ElementA();
+                ElementB b_1 = ElementB();
+                ElementA a_2 = ElementA();
+                ElementB b_2 = ElementB();
+                
+                // A x B or B x A (with diagonal)
+                if (kSideModeA == SideMode::kLeft) {
+                  a_1 = (compare_op_1(row, k_block)) ? 
+                        (tensor_a.at(MatrixCoord(row, k_block))) : ElementA();
+                  b_1 = tensor_b.at(MatrixCoord(k_block, col));
+                } else if (kSideModeA == SideMode::kRight) {
+                  a_1 = tensor_b.at(MatrixCoord(row, k_block));
+                  b_1 = (compare_op_1(k_block, col)) ? 
+                        tensor_a.at(MatrixCoord(k_block, col)) : ElementA();
+                }
+                ComputeType compute_a_1 = ComputeType(a_1);
+                ComputeType compute_b_1 = ComputeType(b_1);
+
+                // The imaginary parts of the diagonal elements of 
+                // a complex data type are assumed and set to zero
+                if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kLeft && row == k_block) {
+                  compute_a_1 = real(compute_a_1);
+                } else if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kRight && k_block == col) {
+                  compute_b_1 = real(compute_b_1);
+                }
+
+                accum[i][j] = inner_product_op(compute_a_1, compute_b_1,  accum[i][j]);
+
+                // A^T x B or B x A^T (without diagonal)
+                if (kSideModeA == SideMode::kLeft) {
+                  a_2 = (compare_op_2(k_block, row)) ? 
+                        (tensor_a.at(MatrixCoord(k_block, row))) : ElementA();
+                  b_2 = tensor_b.at(MatrixCoord(k_block, col));
+                  if (kBlasMode == BlasMode::kHermitian)
+                    a_2 = conj(a_2);
+                } else if (kSideModeA == SideMode::kRight) {
+                  a_2 = tensor_b.at(MatrixCoord(row, k_block));
+                  b_2 = (compare_op_2(col, k_block)) ? 
+                        tensor_a.at(MatrixCoord(col, k_block)) : ElementA();
+                  if (kBlasMode == BlasMode::kHermitian)
+                    b_2 = conj(b_2);
+                }
+
+                ComputeType compute_a_2 = ComputeType(a_2);
+                ComputeType compute_b_2 = ComputeType(b_2);
+
+                accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]);
+              }
+            }
+          }
+        }
+
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            MatrixCoord coord = MatrixCoord(row, col);
+
+            if (row < M && col < N) {
+
+              ScalarType c = tensor_c.at(coord);
+
+              tensor_d.at(coord) = convert_op(
+                alpha * ScalarType(accum[i][j]) + 
+                beta * c);
+            }
+          }
+        }
+
+      } // for (col_block)
+    } // for (row_block)
+
+    tensor_a.add_pointer_offset(batch_stride_A);
+    tensor_b.add_pointer_offset(batch_stride_B);
+    tensor_c.add_pointer_offset(batch_stride_C);
+    tensor_d.add_pointer_offset(batch_stride_D);
+
+  } // for (batch_idx)
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  BlasMode BlasMode_ = cutlass::BlasMode::kSymmetric,
+  typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex
+>
+struct SymmComplex;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiply-add
+template <typename ElementA, typename LayoutA,
+          SideMode SideModeA, FillMode FillModeA, 
+          typename ElementB, typename LayoutB,
+          typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType,
+          BlasMode BlasMode_>
+struct SymmComplex<ElementA, LayoutA, 
+                   SideModeA, FillModeA,
+                   ElementB, LayoutB,
+                   ElementC, LayoutC, ScalarType,
+                   ComputeType, BlasMode_,
+                   arch::OpMultiplyAddComplex> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_symm_complex<ElementA, LayoutA,
+                 SideModeA, FillModeA,
+                 ElementB, LayoutB,
+                 ElementC, LayoutC, 
+                 ScalarType, ComputeType, BlasMode_, multiply_add<ComputeType>>(
+                 problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for gaussian multiply-add 
+template <typename ElementA, typename LayoutA,
+          SideMode SideModeA, FillMode FillModeA,
+          typename ElementB, typename LayoutB,
+          typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType,
+          BlasMode BlasMode_>
+struct SymmComplex<ElementA, LayoutA, 
+                   SideModeA, FillModeA, 
+                   ElementB, LayoutB,
+                   ElementC, LayoutC, ScalarType,
+                   ComputeType, BlasMode_,
+                   arch::OpMultiplyAddGaussianComplex> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b, ScalarType beta,
+                  TensorRef<ElementC, LayoutC> tensor_c,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_symm_complex<ElementA, LayoutA,
+                 SideModeA, FillModeA,
+                 ElementB, LayoutB,
+                 ElementC, LayoutC, 
+                 ScalarType, ComputeType, BlasMode_, multiply_add<ComputeType>>(
+                 problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
index 9dbd6b21..794e21d8 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
index 6ace6623..86b55480 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
index bdc2bf24..17d008ec 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
index 0bc6c57c..d2e29579 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
@@ -42,6 +48,8 @@
 #include "cutlass/subbyte_reference.h"
 #include "cutlass/tensor_view.h"
 #include "cutlass/tensor_view_planar_complex.h"
+#include "cutlass/blas3.h"
+
 #include "cutlass/util/distribution.h"
 #include "tensor_foreach.h"
 
@@ -303,6 +311,51 @@ struct TensorFillGaussianFunc {
   }
 };
 
+/// Computes a random Gaussian distribution
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorFillSymmetricGaussianFunc {
+
+  using TensorView = TensorView<Element, Layout>;
+
+  //
+  // Data members
+  //
+
+  TensorView view;
+  RandomGaussianFunc<Element> func;
+  cutlass::FillMode fill_mode;
+
+  //
+  // Methods
+  //
+
+  /// Construction of Gaussian RNG functor.
+  TensorFillSymmetricGaussianFunc(
+    TensorView view_ = TensorView(),
+    RandomGaussianFunc<Element> func_ = RandomGaussianFunc<Element>(),
+    cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid
+  ):
+    view(view_), func(func_), fill_mode(fill_mode_) {
+
+  }
+
+  /// Compute random value and update RNG state
+  void operator()(Coord<Layout::kRank> const &coord) const {
+    // Fill half of matrix based on FillMode
+    if (Layout::kRank == 2 && 
+        fill_mode == cutlass::FillMode::kLower &&
+        coord[0] >= coord[1]) {
+      view.at(coord) = func();
+    } else if (Layout::kRank == 2 && 
+        fill_mode == cutlass::FillMode::kUpper &&
+        coord[0] <= coord[1]) {
+      view.at(coord) = func();
+    }
+  }
+};
+
 } // namespace detail
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -351,6 +404,36 @@ void TensorFillRandomGaussian(
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+/// Fills a tensor with random values with a Gaussian distribution.
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+void TensorFillSymmetricRandomGaussian(
+  TensorView<Element, Layout> dst,        ///< destination tensor
+  uint64_t seed,                          ///< seed for RNG
+  cutlass::FillMode fill_mode,            ///< FillMode for symmetric matrices
+  double mean = 0,                        ///< Gaussian distribution's mean
+  double stddev = 1,                      ///< Gaussian distribution's standard deviation
+  int bits = -1) {                        ///< If non-negative, specifies number of fractional bits that 
+                                          ///  are not truncated to zero. Permits reducing precision of
+                                          ///  data.
+
+  detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits);
+
+  detail::TensorFillSymmetricGaussianFunc<Element, Layout> func(
+    dst,
+    random_func,
+    fill_mode
+  );
+
+  TensorForEach(
+    dst.extent(),
+    func
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Fills a tensor with random values with a Gaussian distribution.
 template <
   typename Element                        ///< Element type
@@ -566,6 +649,104 @@ struct TensorFillRandomUniformFunc {
   }
 };
 
+/// Computes a random Gaussian distribution
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorFillSymmetricRandomUniformFunc {
+
+  using TensorView = TensorView<Element, Layout>;
+
+  //
+  // Data members
+  //
+
+  TensorView view;
+  RandomUniformFunc<Element> func;
+  cutlass::FillMode fill_mode;
+
+  //
+  // Methods
+  //
+
+  /// Construction of Gaussian RNG functor.
+  TensorFillSymmetricRandomUniformFunc(
+    TensorView view_ = TensorView(),
+    RandomUniformFunc<Element> func_ = RandomUniformFunc<Element>(),
+    cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid
+  ):
+    view(view_), func(func_), fill_mode(fill_mode_) {
+
+  }
+
+  /// Compute random value and update RNG state
+  void operator()(Coord<Layout::kRank> const &coord) const {
+    // Fill half of matrix based on FillMode
+    if (Layout::kRank == 2 && 
+        fill_mode == cutlass::FillMode::kLower &&
+        coord[0] >= coord[1]) {
+      view.at(coord) = func();
+    } else if (Layout::kRank == 2 && 
+        fill_mode == cutlass::FillMode::kUpper &&
+        coord[0] <= coord[1]) {
+      view.at(coord) = func();
+    }
+  }
+};
+
+
+//
+// We expect to release this with CUTLASS 2.4. -akerr
+
+/// Computes a random Uniform distribution and pads diagonal with zeros
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+struct TensorFillPadDiagonalRandomUniformFunc {
+
+  using TensorView = TensorView<Element, Layout>;
+
+  //
+  // Data members
+  //
+
+  TensorView view;
+  RandomUniformFunc<Element> func;
+  cutlass::FillMode fill_mode;
+  int alignment;
+
+  //
+  // Methods
+  //
+
+  /// Construction of Gaussian RNG functor.
+  TensorFillPadDiagonalRandomUniformFunc(
+    TensorView view_ = TensorView(),
+    RandomUniformFunc<Element> func_ = RandomUniformFunc<Element>(),
+    cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid,
+    int alignment_ = 1
+  ):
+    view(view_), func(func_), fill_mode(fill_mode_), alignment(alignment_) {
+
+  }
+
+  /// Compute random value and update RNG state
+  void operator()(Coord<Layout::kRank> const &coord) const {
+    // Fill half of matrix based on FillMode
+    if (Layout::kRank == 2 && 
+        (fill_mode == cutlass::FillMode::kLower) &&
+        (coord[0] >= coord[1]) || 
+        ((coord[1] - coord[0]) >= alignment)) {
+      view.at(coord) = func();
+    } else if (Layout::kRank == 2 && 
+        fill_mode == cutlass::FillMode::kUpper &&
+        (coord[0] <= coord[1]) ||
+        ((coord[0] - coord[1]) >= alignment)) {
+      view.at(coord) = func();
+    }
+  }
+};
+
 } // namespace detail
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -639,6 +820,69 @@ void TensorFillRandomUniform(
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fills a tensor with random values with a uniform random distribution.
+template <
+  typename Element,               ///< Element type
+  typename Layout>                ///< Layout function
+void TensorFillSymmetricRandomUniform(
+  TensorView<Element, Layout> dst,        ///< destination tensor
+  uint64_t seed,                          ///< seed for RNG
+  cutlass::FillMode fill_mode,            ///< FillMode for symmetric matrices
+  double max = 1,                         ///< upper bound of distribution
+  double min = 0,                         ///< lower bound for distribution
+  int bits = -1) {                        ///< If non-negative, specifies number of fractional bits that 
+                                          ///  are not truncated to zero. Permits reducing precision of
+                                          ///  data.
+
+  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
+
+  detail::TensorFillSymmetricRandomUniformFunc<Element, Layout> func(
+    dst,
+    random_func,
+    fill_mode
+  );
+
+  TensorForEach(
+    dst.extent(),
+    func
+  );
+}
+
+/// Fills a tensor with random values with a uniform random distribution pads zeros along diagonal
+template <
+  typename Element,                       ///< Element type
+  typename Layout>                        ///< Layout function
+void TensorFillPadDiagonalRandomUniform(
+  TensorView<Element, Layout> dst,        ///< destination tensor
+  uint64_t seed,                          ///< seed for RNG
+  cutlass::FillMode fill_mode,            ///< FillMode for symmetric matrices
+  double max = 1,                         ///< upper bound of distribution
+  double min = 0,                         ///< lower bound for distribution
+  int bits = -1,                          ///< If non-negative, specifies number of fractional bits that 
+                                          ///  are not truncated to zero. Permits reducing precision of
+                                          ///  data.
+  int alignment = 1 
+) {
+
+  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
+
+  detail::TensorFillPadDiagonalRandomUniformFunc<Element, Layout> func(
+    dst,
+    random_func,
+    fill_mode,
+    alignment
+  );
+
+  TensorForEach(
+    dst.extent(),
+    func
+  );
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Fills a tensor with random values with a uniform random distribution.
 template <
   typename Element                        ///< Element type
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
index ed429502..08491c7e 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
index 2b3171b6..93a05020 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
index 6710a813..1da7cbea 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
diff --git a/tools/util/include/cutlass/util/reference/host/trmm.h b/tools/util/include/cutlass/util/reference/host/trmm.h
new file mode 100644
index 00000000..0e2337c1
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/trmm.h
@@ -0,0 +1,215 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for TRMM in host-side code.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/arch/mma.h"
+#include "cutlass/util/host_tensor.h"
+
+#include "cutlass/util/reference/host/gemm.h"
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  DiagType DiagTypeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_trmm(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  static_assert(SideModeA != SideMode::kInvalid
+                , "Side Mode can either be Left or Right.");
+
+  static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper
+                , "Fill Mode can either be Lower or Upper.");
+
+  using CompareOp = typename TrMatrixCompareOp<FillModeA, DiagTypeA>::Type;
+
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  // Assuming correct k-dimension value is passed
+  int const K = problem_size.k();
+ 
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  CompareOp compare_op;
+
+  for (int row_block = 0; row_block < M; row_block += Mblock) {
+    for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+      ComputeType accum[Mblock][Nblock];
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          accum[i][j] = initial_accum;
+        }
+      }
+
+      for (int k_block = 0; k_block < K; ++k_block) {
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            if (row < M && col < N) {
+              ElementA a = ElementA();
+              ElementB b = ElementB();
+
+              if (SideModeA == SideMode::kLeft) {
+                a = (compare_op(row, k_block)) ? 
+                            (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0);
+                if (row == k_block && DiagTypeA == DiagType::kUnit) {
+                  a = ElementA(1);
+                }
+                b = tensor_b.at(MatrixCoord(k_block, col));
+              } else if (SideModeA == SideMode::kRight) {
+                a = tensor_b.at(MatrixCoord(row, k_block));
+                b = (compare_op(k_block, col)) ? 
+                      tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0);
+                if (k_block == col && DiagTypeA == DiagType::kUnit) {
+                  b = ElementA(1);
+                }
+              }
+                            
+              ComputeType compute_a(cast_if_scalar<ComputeType>(a));
+              ComputeType compute_b(cast_if_scalar<ComputeType>(b));
+
+              accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]);
+            }
+          }
+        }
+      }
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          int row = row_block + i;
+          int col = col_block + j;
+
+          MatrixCoord coord = MatrixCoord(row, col);
+
+          if (row < M && col < N) {
+            tensor_d.at(coord) = convert_op(
+              alpha * ScalarType(accum[i][j]));
+          }
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  DiagType DiagTypeA,
+  typename ElementB,
+  typename LayoutB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = cutlass::arch::OpMultiplyAdd
+>
+struct Trmm;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiply-add
+template <typename ElementA, typename LayoutA, SideMode SideModeA,
+           FillMode FillModeA, DiagType DiagTypeA, 
+           typename ElementB, typename LayoutB,
+           typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType>
+struct Trmm<ElementA, LayoutA, SideModeA, FillModeA, DiagTypeA, ElementB, LayoutB,
+            ElementC, LayoutC, ScalarType,
+            ComputeType, arch::OpMultiplyAdd> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_trmm<ElementA, LayoutA, SideModeA, FillModeA, DiagTypeA, ElementB, LayoutB,
+                 ElementC, LayoutC, ScalarType, ComputeType, multiply_add<ComputeType>>(
+                 problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/reference/host/trmm_complex.h b/tools/util/include/cutlass/util/reference/host/trmm_complex.h
new file mode 100644
index 00000000..33beaf59
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/trmm_complex.h
@@ -0,0 +1,262 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Reference implementation for complex-valued TRMM in host-side code.
+
+  
+*/
+
+#pragma once
+
+#include "cutlass/blas3.h"
+#include "cutlass/complex.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/util/reference/host/gemm.h"
+
+namespace cutlass {
+namespace reference {
+namespace host {
+
+/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef
+/// objects.
+template <
+  typename ElementA,
+  typename LayoutA,
+  ComplexTransform TransformA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  DiagType DiagTypeA,
+  typename ElementB,
+  typename LayoutB,
+  ComplexTransform TransformB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = multiply_add<ComputeType>,
+  typename ConvertOp = NumericConverter<ElementC, ScalarType>
+>
+void compute_trmm_complex(
+  gemm::GemmCoord problem_size,
+  ScalarType alpha,
+  TensorRef<ElementA, LayoutA> tensor_a,
+  TensorRef<ElementB, LayoutB> tensor_b,
+  TensorRef<ElementC, LayoutC> tensor_d,
+  ComputeType initial_accum) {
+
+  static_assert(
+    LayoutA::kRank == 2 &&
+    LayoutC::kRank == 2, "Tensors must be of rank 2");
+
+  static_assert(SideModeA != SideMode::kInvalid
+                , "Side Mode can either be Left or Right.");
+
+  static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper
+                , "Fill Mode can either be Lower or Upper.");
+
+  using CompareOp = typename TrMatrixCompareOp<FillModeA, DiagTypeA>::Type;
+  
+  // Note: batch is ignored.
+  int const M = problem_size.m();
+  int const N = problem_size.n();
+  // Assuming correct k-dimension value is passed
+  int const K = problem_size.k();
+ 
+  // Blocking necessary to speedup reference implementation
+  int const Mblock = 16;
+  int const Nblock = 16;
+
+  ConvertOp convert_op;
+  InnerProductOp inner_product_op;
+  CompareOp compare_op;
+  
+  for (int row_block = 0; row_block < M; row_block += Mblock) {
+    for (int col_block = 0; col_block < N; col_block += Nblock) {
+
+      ComputeType accum[Mblock][Nblock];
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          accum[i][j] = initial_accum;
+        }
+      }
+
+      for (int k_block = 0; k_block < K; ++k_block) {
+        for (int j = 0; j < Nblock; j++) {
+          for (int i = 0; i < Mblock; i++) {
+            int row = row_block + i;
+            int col = col_block + j;
+
+            if (row < M && col < N) {
+              ElementA a = ElementA();
+              ElementB b = ElementB();
+              
+              if (SideModeA == SideMode::kLeft) {
+                a = (compare_op(row, k_block)) ? 
+                              (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0);
+                if (row == k_block && DiagTypeA == DiagType::kUnit) {
+                  a = ElementA(1);
+                }
+                b = tensor_b.at(MatrixCoord(k_block, col));
+              } else if (SideModeA == SideMode::kRight) {
+                a = tensor_b.at(MatrixCoord(row, k_block));
+                b = (compare_op(k_block, col)) ? 
+                      tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0);
+                if (k_block == col && DiagTypeA == DiagType::kUnit) {
+                  b = ElementA(1);
+                }
+              }
+
+              ComputeType a_ik = ComputeType(a);
+              ComputeType b_kj = ComputeType(b);
+              
+              // Conjugate, and hence hermitian, is only allowed for the triangular matrix
+              if (SideModeA == SideMode::kLeft && TransformA == ComplexTransform::kConjugate) {
+                a_ik = conj(a_ik);
+              } else if (SideModeA == SideMode::kRight && TransformA == ComplexTransform::kConjugate) {
+                b_kj = conj(b_kj);
+              }
+
+              accum[i][j] = inner_product_op(a_ik, b_kj,  accum[i][j]);
+            }
+          }
+        }
+      }
+
+      for (int j = 0; j < Nblock; j++) {
+        for (int i = 0; i < Mblock; i++) {
+          int row = row_block + i;
+          int col = col_block + j;
+
+          MatrixCoord coord = MatrixCoord(row, col);
+
+          if (row < M && col < N) {
+            tensor_d.at(coord) = convert_op(
+              alpha * ScalarType(accum[i][j]));
+          }
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename ElementA,
+  typename LayoutA,
+  ComplexTransform TransformA,
+  SideMode SideModeA,
+  FillMode FillModeA,
+  DiagType DiagTypeA,
+  typename ElementB,
+  typename LayoutB,
+  ComplexTransform TransformB,
+  typename ElementC,
+  typename LayoutC,
+  typename ScalarType,
+  typename ComputeType,
+  typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex
+>
+struct TrmmComplex;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for multiply-add
+template <typename ElementA, typename LayoutA, ComplexTransform TransformA,
+          SideMode SideModeA, FillMode FillModeA, DiagType DiagTypeA, 
+          typename ElementB, typename LayoutB, ComplexTransform TransformB,
+          typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType>
+struct TrmmComplex<ElementA, LayoutA, TransformA, 
+                   SideModeA, FillModeA, DiagTypeA,
+                   ElementB, LayoutB, TransformB,
+                   ElementC, LayoutC, ScalarType,
+                   ComputeType, arch::OpMultiplyAddComplex> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_trmm_complex<ElementA, LayoutA, TransformA,
+                 SideModeA, FillModeA, DiagTypeA,
+                 ElementB, LayoutB, TransformB,
+                 ElementC, LayoutC, 
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+                 problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Partial specialization for gaussian multiply-add 
+template <typename ElementA, typename LayoutA, ComplexTransform TransformA,
+          SideMode SideModeA, FillMode FillModeA, DiagType DiagTypeA, 
+          typename ElementB, typename LayoutB, ComplexTransform TransformB,
+          typename ElementC, typename LayoutC,
+          typename ScalarType, typename ComputeType>
+struct TrmmComplex<ElementA, LayoutA, TransformA, 
+                   SideModeA, FillModeA, DiagTypeA,
+                   ElementB, LayoutB, TransformB,
+                   ElementC, LayoutC, ScalarType,
+                   ComputeType, arch::OpMultiplyAddGaussianComplex> {
+
+  void operator()(gemm::GemmCoord problem_size, ScalarType alpha,
+                  TensorRef<ElementA, LayoutA> tensor_a,
+                  TensorRef<ElementB, LayoutB> tensor_b,
+                  TensorRef<ElementC, LayoutC> tensor_d,
+                  ComputeType initial_accum = ComputeType(0)) {
+    static_assert(
+        LayoutA::kRank == 2 && LayoutC::kRank == 2,
+        "Tensors must be of rank 2");
+
+    compute_trmm_complex<ElementA, LayoutA, TransformA,
+                 SideModeA, FillModeA, DiagTypeA,
+                 ElementB, LayoutB, TransformB,
+                 ElementC, LayoutC, 
+                 ScalarType, ComputeType, multiply_add<ComputeType>>(
+                 problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace host
+} // namespace reference
+} // namespace cutlass
diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h
index cdb5fd79..75c07b38 100644
--- a/tools/util/include/cutlass/util/tensor_view_io.h
+++ b/tools/util/include/cutlass/util/tensor_view_io.h
@@ -1,25 +1,31 @@
 /***************************************************************************************************
-* Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
-*
-* Redistribution and use in source and binary forms, with or without modification, are permitted
-* provided that the following conditions are met:
-*     * Redistributions of source code must retain the above copyright notice, this list of
-*       conditions and the following disclaimer.
-*     * Redistributions in binary form must reproduce the above copyright notice, this list of
-*       conditions and the following disclaimer in the documentation and/or other materials
-*       provided with the distribution.
-*     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
-*       to endorse or promote products derived from this software without specific prior written
-*       permission.
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
-* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
-* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 #pragma once
diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h
index 2c1f3a22..d4f157f0 100644
--- a/tools/util/include/cutlass/util/type_traits.h
+++ b/tools/util/include/cutlass/util/type_traits.h
@@ -1,24 +1,30 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- * Redistribution and use in source and binary forms, with or without modification, are permitted
- * provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright notice, this list of
- *       conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright notice, this list of
- *       conditions and the following disclaimer in the documentation and/or other materials
- *       provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
- *       to endorse or promote products derived from this software without specific prior written
- *       permission.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
- * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/