From e123f92c72836b27afe429176be973d43e2bef4c Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Thu, 9 Mar 2023 20:22:56 -0800 Subject: [PATCH] Fix typos 2 (#842) Co-authored-by: Haicheng Wu <57973641+hwu36@users.noreply.github.com> --- README.md | 8 ++--- docs/annotated.html | 10 +++--- ..._00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html | 2 +- ..._00_07c56401b4df75709ae636675d9980a9a.html | 2 +- ...ayout4d0960ae6b1d1bf19e6239dbd002249c.html | 2 +- docs/command__line_8h_source.html | 4 +-- docs/device_2gemm__batched_8h.html | 2 +- ...ce_2kernel_2tensor__foreach_8h_source.html | 2 +- docs/device_2tensor__fill_8h.html | 2 +- docs/device_2tensor__fill_8h_source.html | 2 +- docs/device_2tensor__foreach_8h_source.html | 2 +- docs/functions_func_s.html | 2 +- docs/functions_s.html | 2 +- docs/hierarchy.html | 10 +++--- docs/host_2tensor__fill_8h.html | 2 +- docs/host_2tensor__fill_8h_source.html | 2 +- docs/host_2tensor__foreach_8h_source.html | 2 +- ...clude_2cutlass_2gemm_2device_2gemm_8h.html | 2 +- ...tlass_2gemm_2device_2gemm__complex_8h.html | 2 +- docs/mma__pipelined_8h_source.html | 2 +- docs/namespacecutlass_1_1gemm_1_1device.html | 6 ++-- ...mespacecutlass_1_1reference_1_1device.html | 4 +-- ...namespacecutlass_1_1reference_1_1host.html | 4 +-- docs/search/all_12.js | 2 +- docs/search/functions_12.js | 2 +- .../structcutlass_1_1CommandLine-members.html | 2 +- docs/structcutlass_1_1CommandLine.html | 4 +-- ...ayout660562b232f408218828ca5915b7e73a.html | 2 +- ...Helper_3_01Func_00_01Rank_00_010_01_4.html | 2 +- ...outB_4f3f32c4b336238abfd741e87bfced46.html | 2 +- ...Helper_3_01Func_00_01Rank_00_010_01_4.html | 2 +- ...ass_2util_2reference_2device_2gemm_8h.html | 2 +- ...tlass_2util_2reference_2host_2gemm_8h.html | 2 +- docs/wmma__sm75_8h_source.html | 2 +- examples/00_basic_gemm/basic_gemm.cu | 2 +- .../volta_tensorop_gemm.cu | 6 ++-- .../turing_tensorop_gemm.cu | 6 ++-- .../turing_tensorop_conv2dfprop.cu | 6 ++-- .../threadblock/b2b_implicit_gemm_pipelined.h | 4 +-- ...implicit_gemm_pipelined_smem_accumulator.h | 2 +- .../threadblock/b2b_mma_pipelined.h | 2 +- .../b2b_mma_pipelined_smem_accumulator.h | 2 +- .../ampere_tensorop_conv2dfprop.cu | 6 ++-- .../ampere_gemm_operand_reduction_fusion.cu | 2 +- .../fused_multihead_attention_fixed_seqlen.cu | 2 +- ...sed_multihead_attention_variable_seqlen.cu | 2 +- .../gemm/custom_mma_pipelined.h | 2 +- .../gemm/mma_from_smem.h | 2 +- ...cated_tile_access_iterator_residual_last.h | 2 +- .../ir_gen/gen_threadblock.py | 34 +++++++++---------- .../ir_gen/gen_verify.py | 8 ++--- .../ir_gen/helper.py | 10 +++--- ..._gemm_schedules_with_collective_builder.cu | 14 ++++---- include/cute/atom/copy_traits_sm90_tma.hpp | 2 +- include/cutlass/arch/mma.h | 4 +-- ..._gradient_tile_access_iterator_optimized.h | 6 ++-- ...activation_tile_access_iterator_analytic.h | 2 +- ...ctivation_tile_access_iterator_optimized.h | 2 +- .../threadblock/depthwise_fprop_pipelined.h | 2 +- .../threadblock/implicit_gemm_pipelined.h | 2 +- .../conv/threadblock/threadblock_swizzle.h | 6 ++-- .../default_epilogue_complex_tensor_op.h | 2 +- ...default_epilogue_complex_tensor_op_blas3.h | 2 +- .../predicated_tile_iterator_direct_conv.h | 2 +- .../predicated_tile_iterator_strided_dgrad.h | 4 +-- include/cutlass/gemm/device/ell_gemm.h | 2 +- include/cutlass/gemm/device/gemm.h | 2 +- include/cutlass/gemm/device/gemm_array.h | 2 +- include/cutlass/gemm/device/gemm_batched.h | 2 +- include/cutlass/gemm/device/gemm_complex.h | 2 +- .../device/gemm_layernorm_mainloop_fusion.h | 2 +- 
include/cutlass/gemm/device/gemm_universal.h | 2 +- .../device/gemm_universal_with_broadcast.h | 2 +- .../gemm/device/gemm_with_k_reduction.h | 2 +- include/cutlass/gemm/device/rank_2k.h | 2 +- include/cutlass/gemm/device/rank_k.h | 2 +- include/cutlass/gemm/device/symm.h | 2 +- include/cutlass/gemm/device/trmm.h | 2 +- .../gemm/kernel/default_gemm_with_broadcast.h | 2 +- .../gemm/kernel/default_gemm_with_reduction.h | 2 +- .../kernel/rank_2k_grouped_problem_visitor.h | 2 +- include/cutlass/gemm/kernel/sm90_gemm_tma.hpp | 2 +- .../kernel/sm90_gemm_tma_warpspecialized.hpp | 2 +- ...90_gemm_tma_warpspecialized_persistent.hpp | 2 +- .../gemm/threadblock/ell_mma_pipelined.h | 2 +- .../mma_planar_complex_pipelined.h | 2 +- .../kernel/tensor_reduce_affine_contiguous.h | 2 +- .../kernel/tensor_reduce_affine_strided.h | 2 +- .../predicated_tile_access_iterator.h | 2 +- .../regular_tile_iterator_tensor_op_sm70.h | 2 +- media/docs/implicit_gemm_convolution.md | 2 +- media/docs/quickstart.md | 7 ++-- test/unit/conv/device/conv2d_testbed.h | 2 +- .../conv/device/conv2d_testbed_interleaved.h | 2 +- .../device/conv2d_with_broadcast_testbed.h | 2 +- .../device/conv2d_with_reduction_testbed.h | 2 +- test/unit/conv/device/conv3d_testbed.h | 2 +- .../device/default_gemm_configuration.hpp | 2 +- .../library/include/cutlass/library/handle.h | 4 +-- .../library/include/cutlass/library/library.h | 14 ++++---- tools/library/scripts/generator.py | 4 +-- tools/library/scripts/pycutlass/README.md | 4 +-- .../pycutlass/docs/source/md/basic_idea.md | 8 ++--- .../scripts/pycutlass/src/cpp/include/arch.h | 6 ++-- .../src/cpp/include/conv/conv_problem_size.h | 2 +- .../epilogue_visitor_op/visitor_op_binary.h | 2 +- .../visitor_op_column_reduction.h | 4 +-- .../visitor_op_linear_combination.h | 2 +- .../visitor_op_row_reduction.h | 4 +-- .../pycutlass/src/cpp/include/gemm/gemm.h | 2 +- .../pycutlass/src/cpp/include/swizzling.h | 6 ++-- .../src/cpp/include/tensor_ref_view.h | 2 +- .../builder/collective_op_builder.py | 8 ++--- .../pycutlass/src/pycutlass/c_types.py | 2 +- .../pycutlass/src/pycutlass/gemm_operation.py | 16 ++++----- .../pycutlass/src/pycutlass/library.py | 2 +- .../src/pycutlass/reduction_operation.py | 4 +-- .../src/pycutlass/test/conv2d_testbed.py | 2 +- .../src/pycutlass/test/gemm_testbed.py | 2 +- .../pycutlass/src/pycutlass/test/utils.py | 2 +- .../pycutlass/test/gemm/gemm_bf16_sm90.py | 2 +- .../pycutlass/test/gemm/gemm_f16_sm90.py | 4 +-- .../pycutlass/test/gemm/gemm_f64_sm90.py | 2 +- .../pycutlass/test/gemm/gemm_s8_sm90.py | 2 +- tools/library/src/conv2d_operation.h | 4 +-- tools/library/src/conv3d_operation.h | 2 +- tools/library/src/handle.cu | 4 +-- tools/library/src/rank_2k_operation.h | 2 +- .../src/reduction/reduction_operation.h | 2 +- .../src/reference/conv_reference_operation.h | 2 +- tools/library/src/symm_operation.h | 2 +- tools/library/src/util.cu | 6 ++-- .../profiler/src/conv2d_operation_profiler.cu | 26 +++++++------- .../profiler/src/conv2d_operation_profiler.h | 8 ++--- .../profiler/src/conv3d_operation_profiler.cu | 16 ++++----- .../profiler/src/conv3d_operation_profiler.h | 10 +++--- tools/profiler/src/cublas_helpers.cu | 2 +- tools/profiler/src/cublas_helpers.h | 12 +++---- tools/profiler/src/cudnn_helpers.cpp | 6 ++-- tools/profiler/src/cudnn_helpers.h | 10 +++--- tools/profiler/src/debug.h | 2 +- tools/profiler/src/device_allocation.cu | 4 +-- tools/profiler/src/device_allocation.h | 4 +-- tools/profiler/src/gemm_operation_profiler.cu | 8 ++--- 
tools/profiler/src/gpu_timer.cpp | 2 +- tools/profiler/src/gpu_timer.h | 2 +- tools/profiler/src/operation_profiler.h | 2 +- tools/profiler/src/options.cu | 4 +-- tools/profiler/src/options.h | 2 +- tools/profiler/src/problem_space.h | 4 +-- .../util/include/cutlass/util/command_line.h | 6 ++-- .../include/cutlass/util/device_groupnorm.h | 2 +- .../cutlass/util/device_nhwc_padding.h | 12 +++---- .../util/include/cutlass/util/helper_cuda.hpp | 2 +- .../cutlass/util/reference/device/gemm.h | 2 +- .../reference/device/kernel/tensor_foreach.h | 2 +- .../util/reference/device/tensor_fill.h | 2 +- .../util/reference/device/tensor_foreach.h | 2 +- .../cutlass/util/reference/host/gemm.h | 2 +- .../cutlass/util/reference/host/tensor_fill.h | 2 +- .../util/reference/host/tensor_foreach.h | 2 +- 161 files changed, 310 insertions(+), 309 deletions(-) diff --git a/README.md b/README.md index dc7cde32..79c11b3e 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture: ### Building a subset Tensor Core GEMM kernels -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture, +To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 @@ -376,7 +376,7 @@ reference_device: Passed ### Building one CUDA Core GEMM kernel -To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 ... @@ -418,7 +418,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ### Building a subset of Tensor Core Convolution kernels To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation -and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 ... @@ -466,7 +466,7 @@ reference_device: Passed ### Building one Convolution CUDA kernel To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation -and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc ... 
diff --git a/docs/annotated.html b/docs/annotated.html index 43923cc2..233691c2 100644 --- a/docs/annotated.html +++ b/docs/annotated.html @@ -280,15 +280,15 @@  CDefaultGemmConfiguration< arch::OpClassWmmaTensorOp, ArchTag, ElementA, ElementB, ElementC, ElementAccumulator >  CGemm  CArgumentsArgument structure - CGemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >Parital specialization for column-major output exchanges problem size and operand + CGemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmBatched  CArgumentsArgument structure - CGemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >Parital specialization for column-major output exchanges problem size and operand + CGemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmComplex  CArgumentsArgument structure - CGemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >Parital specialization for column-major output exchanges problem size and operand + CGemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial >Partial specialization for column-major output exchanges problem size and operand  CArgumentsArgument structure  CGemmSplitKParallel  CArgumentsArgument structure @@ -594,7 +594,7 @@  CGemm  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAdd >Partial specialization for multiply-add  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpMultiplyAddSaturate >Partial specialization for multiply-add-saturate - CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >Parital specialization for XOR-popc + CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >Partial specialization for XOR-popc  CTensorDiagonalForEachLaunches a kernel calling a functor for each element along a tensor's diagonal  CTensorForEachLaunches a kernel calling a functor for each element in a tensor's index space  Nhost @@ 
-620,7 +620,7 @@  CGemm  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAdd >Partial specialization for multiply-add  CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpMultiplyAddSaturate >Partial specialization for multiply-add-saturate - CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >Parital specialization for XOR-popc + CGemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >Partial specialization for XOR-popc  Nthread  CMatrixPer-thread matrix object storing a packed matrix  Ntransform diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html index d53d8d67..6800f4fe 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html @@ -108,7 +108,7 @@
-

Parital specialization for column-major output exchanges problem size and operand. +

Partial specialization for column-major output exchanges problem size and operand.

#include <gemm_batched.h>

diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html b/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html index 84685856..d0978343 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1GemmComplex_3_01ElementA___00_01LayoutA___00_01ElementB___00_07c56401b4df75709ae636675d9980a9a.html @@ -108,7 +108,7 @@
-

Parital specialization for column-major output exchanges problem size and operand. +

Partial specialization for column-major output exchanges problem size and operand.

#include <gemm_complex.h>

diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html b/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html index f34be6e5..323ecfc2 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1Gemm_3_01ElementA___00_01LayoutA___00_01ElementB___00_01Layout4d0960ae6b1d1bf19e6239dbd002249c.html @@ -108,7 +108,7 @@
-

Parital specialization for column-major output exchanges problem size and operand. +

Partial specialization for column-major output exchanges problem size and operand.

#include <gemm.h>
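
The three doc pages above cover the device-level `cutlass::gemm::device::Gemm` family (plain, batched, complex), whose column-major partial specialization internally exchanges the problem size and operands. For reference, a minimal host-side sketch of how that API is typically driven, modeled loosely on the `examples/00_basic_gemm/basic_gemm.cu` file touched by this patch; the wrapper function name and argument values are illustrative only:

```cpp
#include "cutlass/gemm/device/gemm.h"

// Single-precision GEMM; all operands column-major, so the partial
// specialization documented above (which swaps the problem size and
// operands internally) is the one that gets instantiated.
using ColumnMajor = cutlass::layout::ColumnMajor;
using Gemm = cutlass::gemm::device::Gemm<float, ColumnMajor,   // A
                                         float, ColumnMajor,   // B
                                         float, ColumnMajor>;  // C / D

cutlass::Status run_sgemm(int M, int N, int K,
                          float alpha, float const *A, int lda,
                          float const *B, int ldb,
                          float beta, float *C, int ldc) {
  Gemm gemm_op;

  // Arguments: problem size, TensorRefs for A, B, C (source) and D (destination),
  // and the epilogue scalars {alpha, beta}.
  Gemm::Arguments args({M, N, K},
                       {A, lda}, {B, ldb},
                       {C, ldc}, {C, ldc},
                       {alpha, beta});

  return gemm_op(args);  // launches the kernel on the default CUDA stream
}
```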

diff --git a/docs/command__line_8h_source.html b/docs/command__line_8h_source.html index f98c9f1e..325a3034 100644 --- a/docs/command__line_8h_source.html +++ b/docs/command__line_8h_source.html @@ -98,7 +98,7 @@
command_line.h
-Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are not permitted.
6  *
7  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
8  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
11  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
12  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17  *
18  ******************************************************************************/
19 
20 #pragma once
21 
27 #include <iostream>
28 #include <limits>
29 #include <sstream>
30 #include <string>
31 #include <vector>
32 
33 #include <cuda_runtime.h>
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * command_line
39  ******************************************************************************/
40 
44 struct CommandLine {
45  std::vector<std::string> keys;
46  std::vector<std::string> values;
47  std::vector<std::string> args;
48 
52  CommandLine(int argc, const char** argv) {
53  using namespace std;
54 
55  for (int i = 1; i < argc; i++) {
56  string arg = argv[i];
57 
58  if ((arg[0] != '-') || (arg[1] != '-')) {
59  args.push_back(arg);
60  continue;
61  }
62 
63  string::size_type pos;
64  string key, val;
65  if ((pos = arg.find('=')) == string::npos) {
66  key = string(arg, 2, arg.length() - 2);
67  val = "";
68  } else {
69  key = string(arg, 2, pos - 2);
70  val = string(arg, pos + 1, arg.length() - 1);
71  }
72 
73  keys.push_back(key);
74  values.push_back(val);
75  }
76  }
77 
81  bool check_cmd_line_flag(const char* arg_name) const {
82  using namespace std;
83 
84  for (int i = 0; i < int(keys.size()); ++i) {
85  if (keys[i] == string(arg_name)) return true;
86  }
87  return false;
88  }
89 
93  template <typename value_t>
94  int num_naked_args() const {
95  return args.size();
96  }
97 
101  template <typename value_t>
102  void get_cmd_line_argument(int index, value_t& val) const {
103  using namespace std;
104  if (index < args.size()) {
105  istringstream str_stream(args[index]);
106  str_stream >> val;
107  }
108  }
109 
113  void get_cmd_line_argument(const char* arg_name, bool& val, bool _default = true) const {
114  val = _default;
115  if (check_cmd_line_flag(arg_name)) {
116  std::string value;
117  get_cmd_line_argument(arg_name, value);
118 
119  val = !(value == "0" || value == "false");
120  }
121  }
122 
126  template <typename value_t>
127  void get_cmd_line_argument(const char* arg_name,
128  value_t& val,
129  value_t const& _default = value_t()) const {
130  using namespace std;
131 
132  val = _default;
133 
134  for (int i = 0; i < int(keys.size()); ++i) {
135  if (keys[i] == string(arg_name)) {
136  istringstream str_stream(values[i]);
137  str_stream >> val;
138  }
139  }
140  }
141 
145  template <typename value_t>
146  void get_cmd_line_arguments(const char* arg_name,
147  std::vector<value_t>& vals,
148  char sep = ',') const {
149  using namespace std;
150 
151  if (check_cmd_line_flag(arg_name)) {
152  // Clear any default values
153  vals.clear();
154 
155  // Recover from multi-value string
156  for (int i = 0; i < keys.size(); ++i) {
157  if (keys[i] == string(arg_name)) {
158  string val_string(values[i]);
159  seperate_string(val_string, vals, sep);
160  }
161  }
162  }
163  }
164 
169  void get_cmd_line_argument_pairs(const char* arg_name,
170  std::vector<std::pair<std::string, std::string> >& tokens,
171  char delim = ',',
172  char sep = ':') const {
173  if (check_cmd_line_flag(arg_name)) {
174  std::string value;
175  get_cmd_line_argument(arg_name, value);
176 
177  tokenize(tokens, value, delim, sep);
178  }
179  }
180 
185  void get_cmd_line_argument_ranges(const char* arg_name,
186  std::vector<std::vector<std::string> >& vals,
187  char delim = ',',
188  char sep = ':') const {
189  std::vector<std::string> ranges;
190  get_cmd_line_arguments(arg_name, ranges, delim);
191 
192  for (std::vector<std::string>::const_iterator range = ranges.begin();
193  range != ranges.end(); ++range) {
194 
195  std::vector<std::string> range_vals;
196  seperate_string(*range, range_vals, sep);
197  vals.push_back(range_vals);
198  }
199  }
200 
204  int parsed_argc() const { return (int)keys.size(); }
205 
206  //-------------------------------------------------------------------------
207  // Utility functions
208  //-------------------------------------------------------------------------
209 
211  static void tokenize(std::vector<std::pair<std::string, std::string> >& tokens,
212  std::string const& str,
213  char delim = ',',
214  char sep = ':') {
215  // Home-built to avoid Boost dependency
216  size_t s_idx = 0;
217  size_t d_idx = std::string::npos;
218  while (s_idx < str.size()) {
219  d_idx = str.find_first_of(delim, s_idx);
220 
221  size_t end_idx = (d_idx != std::string::npos ? d_idx : str.size());
222  size_t sep_idx = str.find_first_of(sep, s_idx);
223  size_t offset = 1;
224  if (sep_idx == std::string::npos || sep_idx >= end_idx) {
225  sep_idx = end_idx;
226  offset = 0;
227  }
228 
229  std::pair<std::string, std::string> item(
230  str.substr(s_idx, sep_idx - s_idx),
231  str.substr(sep_idx + offset, end_idx - sep_idx - offset));
232 
233  tokens.push_back(item);
234  s_idx = end_idx + 1;
235  }
236  }
237 
239  static void tokenize(std::vector<std::string>& tokens,
240  std::string const& str,
241  char delim = ',',
242  char sep = ':') {
243  typedef std::vector<std::pair<std::string, std::string> > TokenVector;
244  typedef TokenVector::const_iterator token_iterator;
245 
246  std::vector<std::pair<std::string, std::string> > token_pairs;
247  tokenize(token_pairs, str, delim, sep);
248  for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) {
249  tokens.push_back(tok->first);
250  }
251  }
252 
253  template <typename value_t>
254  static void seperate_string(std::string const& str,
255  std::vector<value_t>& vals,
256  char sep = ',') {
257  std::istringstream str_stream(str);
258  std::string::size_type old_pos = 0;
259  std::string::size_type new_pos = 0;
260 
261  // Iterate <sep>-delimited values
262  value_t val;
263  while ((new_pos = str.find(sep, old_pos)) != std::string::npos) {
264  if (new_pos != old_pos) {
265  str_stream.width(new_pos - old_pos);
266  str_stream >> val;
267  vals.push_back(val);
268  }
269 
270  // skip over delimiter
271  str_stream.ignore(1);
272  old_pos = new_pos + 1;
273  }
274 
275  // Read last value
276  str_stream >> val;
277  vals.push_back(val);
278  }
279 };
280 
281 } // namespace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /******************************************************************************
2  * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are not permitted.
6  *
7  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
8  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
9  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
10  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
11  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
12  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
13  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
14  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
15  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
16  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17  *
18  ******************************************************************************/
19 
20 #pragma once
21 
27 #include <iostream>
28 #include <limits>
29 #include <sstream>
30 #include <string>
31 #include <vector>
32 
33 #include <cuda_runtime.h>
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * command_line
39  ******************************************************************************/
40 
44 struct CommandLine {
45  std::vector<std::string> keys;
46  std::vector<std::string> values;
47  std::vector<std::string> args;
48 
52  CommandLine(int argc, const char** argv) {
53  using namespace std;
54 
55  for (int i = 1; i < argc; i++) {
56  string arg = argv[i];
57 
58  if ((arg[0] != '-') || (arg[1] != '-')) {
59  args.push_back(arg);
60  continue;
61  }
62 
63  string::size_type pos;
64  string key, val;
65  if ((pos = arg.find('=')) == string::npos) {
66  key = string(arg, 2, arg.length() - 2);
67  val = "";
68  } else {
69  key = string(arg, 2, pos - 2);
70  val = string(arg, pos + 1, arg.length() - 1);
71  }
72 
73  keys.push_back(key);
74  values.push_back(val);
75  }
76  }
77 
81  bool check_cmd_line_flag(const char* arg_name) const {
82  using namespace std;
83 
84  for (int i = 0; i < int(keys.size()); ++i) {
85  if (keys[i] == string(arg_name)) return true;
86  }
87  return false;
88  }
89 
93  template <typename value_t>
94  int num_naked_args() const {
95  return args.size();
96  }
97 
101  template <typename value_t>
102  void get_cmd_line_argument(int index, value_t& val) const {
103  using namespace std;
104  if (index < args.size()) {
105  istringstream str_stream(args[index]);
106  str_stream >> val;
107  }
108  }
109 
113  void get_cmd_line_argument(const char* arg_name, bool& val, bool _default = true) const {
114  val = _default;
115  if (check_cmd_line_flag(arg_name)) {
116  std::string value;
117  get_cmd_line_argument(arg_name, value);
118 
119  val = !(value == "0" || value == "false");
120  }
121  }
122 
126  template <typename value_t>
127  void get_cmd_line_argument(const char* arg_name,
128  value_t& val,
129  value_t const& _default = value_t()) const {
130  using namespace std;
131 
132  val = _default;
133 
134  for (int i = 0; i < int(keys.size()); ++i) {
135  if (keys[i] == string(arg_name)) {
136  istringstream str_stream(values[i]);
137  str_stream >> val;
138  }
139  }
140  }
141 
145  template <typename value_t>
146  void get_cmd_line_arguments(const char* arg_name,
147  std::vector<value_t>& vals,
148  char sep = ',') const {
149  using namespace std;
150 
151  if (check_cmd_line_flag(arg_name)) {
152  // Clear any default values
153  vals.clear();
154 
155  // Recover from multi-value string
156  for (int i = 0; i < keys.size(); ++i) {
157  if (keys[i] == string(arg_name)) {
158  string val_string(values[i]);
159  separate_string(val_string, vals, sep);
160  }
161  }
162  }
163  }
164 
169  void get_cmd_line_argument_pairs(const char* arg_name,
170  std::vector<std::pair<std::string, std::string> >& tokens,
171  char delim = ',',
172  char sep = ':') const {
173  if (check_cmd_line_flag(arg_name)) {
174  std::string value;
175  get_cmd_line_argument(arg_name, value);
176 
177  tokenize(tokens, value, delim, sep);
178  }
179  }
180 
185  void get_cmd_line_argument_ranges(const char* arg_name,
186  std::vector<std::vector<std::string> >& vals,
187  char delim = ',',
188  char sep = ':') const {
189  std::vector<std::string> ranges;
190  get_cmd_line_arguments(arg_name, ranges, delim);
191 
192  for (std::vector<std::string>::const_iterator range = ranges.begin();
193  range != ranges.end(); ++range) {
194 
195  std::vector<std::string> range_vals;
196  separate_string(*range, range_vals, sep);
197  vals.push_back(range_vals);
198  }
199  }
200 
204  int parsed_argc() const { return (int)keys.size(); }
205 
206  //-------------------------------------------------------------------------
207  // Utility functions
208  //-------------------------------------------------------------------------
209 
211  static void tokenize(std::vector<std::pair<std::string, std::string> >& tokens,
212  std::string const& str,
213  char delim = ',',
214  char sep = ':') {
215  // Home-built to avoid Boost dependency
216  size_t s_idx = 0;
217  size_t d_idx = std::string::npos;
218  while (s_idx < str.size()) {
219  d_idx = str.find_first_of(delim, s_idx);
220 
221  size_t end_idx = (d_idx != std::string::npos ? d_idx : str.size());
222  size_t sep_idx = str.find_first_of(sep, s_idx);
223  size_t offset = 1;
224  if (sep_idx == std::string::npos || sep_idx >= end_idx) {
225  sep_idx = end_idx;
226  offset = 0;
227  }
228 
229  std::pair<std::string, std::string> item(
230  str.substr(s_idx, sep_idx - s_idx),
231  str.substr(sep_idx + offset, end_idx - sep_idx - offset));
232 
233  tokens.push_back(item);
234  s_idx = end_idx + 1;
235  }
236  }
237 
239  static void tokenize(std::vector<std::string>& tokens,
240  std::string const& str,
241  char delim = ',',
242  char sep = ':') {
243  typedef std::vector<std::pair<std::string, std::string> > TokenVector;
244  typedef TokenVector::const_iterator token_iterator;
245 
246  std::vector<std::pair<std::string, std::string> > token_pairs;
247  tokenize(token_pairs, str, delim, sep);
248  for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) {
249  tokens.push_back(tok->first);
250  }
251  }
252 
253  template <typename value_t>
254  static void separate_string(std::string const& str,
255  std::vector<value_t>& vals,
256  char sep = ',') {
257  std::istringstream str_stream(str);
258  std::string::size_type old_pos = 0;
259  std::string::size_type new_pos = 0;
260 
261  // Iterate <sep>-delimited values
262  value_t val;
263  while ((new_pos = str.find(sep, old_pos)) != std::string::npos) {
264  if (new_pos != old_pos) {
265  str_stream.width(new_pos - old_pos);
266  str_stream >> val;
267  vals.push_back(val);
268  }
269 
270  // skip over delimiter
271  str_stream.ignore(1);
272  old_pos = new_pos + 1;
273  }
274 
275  // Read last value
276  str_stream >> val;
277  vals.push_back(val);
278  }
279 };
280 
281 } // namespace cutlass
Definition: aligned_buffer.h:35
void get_cmd_line_argument(const char *arg_name, value_t &val, value_t const &_default=value_t()) const
Definition: command_line.h:127
void get_cmd_line_argument_pairs(const char *arg_name, std::vector< std::pair< std::string, std::string > > &tokens, char delim= ',', char sep= ':') const
Definition: command_line.h:169
STL namespace.
@@ -116,7 +116,7 @@
CommandLine(int argc, const char **argv)
Definition: command_line.h:52
std::vector< std::string > args
Definition: command_line.h:47
Definition: command_line.h:44
-
static void seperate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')
Definition: command_line.h:254
+
static void separate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')
Definition: command_line.h:254
int parsed_argc() const
Definition: command_line.h:204
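
The `CommandLine` listing above stores each `--key=value` pair in parallel `keys`/`values` vectors and everything else in `args`. A small usage sketch built only from the member functions shown in that listing; the option names `--m`, `--kernels`, and `--extent` are made up for illustration:

```cpp
#include <iostream>
#include <string>
#include <vector>

#include "cutlass/util/command_line.h"

int main(int argc, const char **argv) {
  cutlass::CommandLine cmd(argc, argv);

  int m = 128;              // default used when --m is absent
  std::string kernels;      // e.g. --kernels=sgemm
  std::vector<int> extent;  // e.g. --extent=16,32,64 (comma-separated)

  cmd.get_cmd_line_argument("m", m);
  cmd.get_cmd_line_argument("kernels", kernels);
  cmd.get_cmd_line_arguments("extent", extent);

  if (cmd.check_cmd_line_flag("help")) {
    std::cout << "usage: example [--m=<int>] [--kernels=<name>] [--extent=<int,int,...>]\n";
    return 0;
  }

  std::cout << "m = " << m << ", kernels = " << kernels
            << ", extent values parsed = " << extent.size() << "\n";
  return 0;
}
```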
diff --git a/docs/device_2gemm__batched_8h.html b/docs/device_2gemm__batched_8h.html index e648c67e..3ef58bf1 100644 --- a/docs/device_2gemm__batched_8h.html +++ b/docs/device_2gemm__batched_8h.html @@ -130,7 +130,7 @@  Argument structure. More...
  class  cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  struct  cutlass::gemm::device::GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ >::Arguments  Argument structure. More...
diff --git a/docs/device_2kernel_2tensor__foreach_8h_source.html b/docs/device_2kernel_2tensor__foreach_8h_source.html index a4839c25..21c8fb59 100644 --- a/docs/device_2kernel_2tensor__foreach_8h_source.html +++ b/docs/device_2kernel_2tensor__foreach_8h_source.html @@ -100,7 +100,7 @@
Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
26 #pragma once
27 
28 #include "cutlass/cutlass.h"
29 #include "cutlass/coord.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 namespace kernel {
35 
37 
39 namespace detail {
40 
42 template <typename Func, int Rank, int RankRemaining>
44 
46  __inline__ __device__
47  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
48 
49  int64_t product = 1;
50 
52  for (int i = Rank - RankRemaining; i < Rank; ++i) {
53  product *= size[i];
54  }
55 
56  coord[Rank - 1 - RankRemaining] = index / product;
57  int64_t remaining = index % product;
58 
59  TensorForEachHelper<Func, Rank, RankRemaining-1>(func, size, coord, remaining);
60  }
61 };
62 
64 template <typename Func, int Rank>
65 struct TensorForEachHelper<Func, Rank, 0> {
66 
68  __inline__ __device__
69  TensorForEachHelper(Func &func, Coord<Rank> const &size, Coord<Rank> &coord, int64_t index) {
70 
71  coord[Rank - 1] = index;
72 
73  if (coord < size) {
74  func(coord);
75  }
76  }
77 };
78 
79 } // namespace detail
80 
82 
84 template <typename Func, int Rank, typename Params>
85 __global__ void TensorForEach(Coord<Rank> size, Params params = Params()) {
86 
87  Func func(params);
88 
89  int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
90  int64_t max_index = 1;
91 
93  for (int i = 0; i < Rank; ++i) {
94  max_index *= size[i];
95  }
96 
98  while (index < max_index) {
99  Coord<Rank> coord;
100 
101  detail::TensorForEachHelper<Func, Rank, Rank - 1>(func, size, coord, index);
102  index += blockDim.x * gridDim.x;
103  }
104 }
105 
107 
109 template <typename Func, int Rank, typename Params>
110 __global__ void TensorDiagonalForEach(Coord<Rank> size, Params params, int start, int end) {
111 
112  Func func(params);
113 
114  int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start;
115 
116  if (index < end) {
117  Coord<Rank> coord;
118 
120  for (int i = 0; i < Rank; ++i) {
121  coord[i] = index;
122  }
123 
124  func(coord);
125  }
126 }
127 
129 
130 template <typename Element, typename Func>
131 __global__ void BlockForEach(
132  Element *ptr,
133  size_t capacity,
134  typename Func::Params params) {
135 
136  Func func(params);
137 
138  size_t index = threadIdx.x + blockIdx.x * blockDim.x;
139 
140  for (; index < capacity; index += blockDim.x * gridDim.x) {
141  ptr[index] = func();
142  }
143 }
144 
146 
147 } // namespace kernel
148 } // namespace device
149 } // namespace reference
150 } // namespace cutlass
151 
Definition: aligned_buffer.h:35
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
-
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for fastest chaning rank.
Definition: device/kernel/tensor_foreach.h:69
+
__inline__ __device__ TensorForEachHelper(Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
Constructor for fastest changing rank.
Definition: device/kernel/tensor_foreach.h:69
#define CUTLASS_PRAGMA_UNROLL
Definition: cutlass.h:110
__global__ void BlockForEach(Element *ptr, size_t capacity, typename Func::Params params)
Definition: device/kernel/tensor_foreach.h:131
#define CUTLASS_PRAGMA_NO_UNROLL
Definition: cutlass.h:111
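
The `TensorForEachHelper` recursion in the listing above converts a linear index into a rank-`Rank` coordinate: at each level it divides by the product of the remaining (faster-varying) extents, keeps the quotient as the current coordinate, and recurses on the remainder until only the fastest-changing rank is left (the `RankRemaining == 0` specialization). A host-side C++ mimic of that decomposition, for illustration only (not part of CUTLASS):

```cpp
#include <array>
#include <cstdint>

// Mirrors detail::TensorForEachHelper: peel off one rank at a time,
// dividing by the product of the extents that remain to the right.
template <int Rank>
std::array<int64_t, Rank> index_to_coord(int64_t index,
                                         std::array<int64_t, Rank> const &size) {
  std::array<int64_t, Rank> coord{};
  for (int r = 0; r < Rank - 1; ++r) {
    int64_t product = 1;
    for (int i = r + 1; i < Rank; ++i) {
      product *= size[i];
    }
    coord[r] = index / product;  // coordinate for rank r
    index %= product;            // remainder handled by the next level
  }
  coord[Rank - 1] = index;       // fastest-changing rank
  return coord;
}

// Example: index 14 in a 3x4x2 tensor decomposes to coordinate (1, 3, 0).
```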
diff --git a/docs/device_2tensor__fill_8h.html b/docs/device_2tensor__fill_8h.html index c2af8c44..5a99459c 100644 --- a/docs/device_2tensor__fill_8h.html +++ b/docs/device_2tensor__fill_8h.html @@ -237,7 +237,7 @@   template<typename Element , typename Layout > void cutlass::reference::device::TensorFillIdentity (TensorView< Element, Layout > view) - Fills a tensor's digonal with 1 and 0 everywhere else. More...
+ Fills a tensor's diagonal with 1 and 0 everywhere else. More...
  template<typename Element , typename Layout > void cutlass::reference::device::TensorUpdateDiagonal (TensorView< Element, Layout > view, Element diag=Element(1)) diff --git a/docs/device_2tensor__fill_8h_source.html b/docs/device_2tensor__fill_8h_source.html index dd5debda..908ccba8 100644 --- a/docs/device_2tensor__fill_8h_source.html +++ b/docs/device_2tensor__fill_8h_source.html @@ -125,7 +125,7 @@
Parameters structure.
Definition: device/tensor_fill.h:99
Kind kind
Active variant kind.
Definition: distribution.h:64
Params(TensorView view_=TensorView(), typename RandomFunc::Params random_=RandomFunc::Params())
Construction of Gaussian RNG functor.
Definition: device/tensor_fill.h:422
-
void TensorFillIdentity(TensorView< Element, Layout > view)
Fills a tensor&#39;s digonal with 1 and 0 everywhere else.
Definition: device/tensor_fill.h:630
+
void TensorFillIdentity(TensorView< Element, Layout > view)
Fills a tensor&#39;s diagonal with 1 and 0 everywhere else.
Definition: device/tensor_fill.h:630
CUTLASS_HOST_DEVICE TensorCoord const & extent() const
Returns the extent of the view (the size along each logical dimension).
Definition: tensor_view.h:167
Computes a random Gaussian distribution.
Definition: device/tensor_fill.h:645
int int_scale
Definition: device/tensor_fill.h:315
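
For reference, `TensorFillIdentity`, whose description is corrected in the hunk above, is usually driven through the `cutlass/util` host-tensor utilities. A minimal sketch, assuming a tensor managed by `cutlass::HostTensor` and compiled with nvcc:

```cpp
#include "cutlass/layout/matrix.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/tensor_fill.h"

int main() {
  // 8x8 row-major matrix with storage allocated on both host and device.
  cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({8, 8});

  // Writes 1 on the diagonal and 0 everywhere else, executing on the device.
  cutlass::reference::device::TensorFillIdentity(tensor.device_view());

  tensor.sync_host();  // copy the result back to the host for inspection
  return 0;
}
```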
diff --git a/docs/device_2tensor__foreach_8h_source.html b/docs/device_2tensor__foreach_8h_source.html index 90c5402e..0380fa93 100644 --- a/docs/device_2tensor__foreach_8h_source.html +++ b/docs/device_2tensor__foreach_8h_source.html @@ -98,7 +98,7 @@
device/tensor_foreach.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 
36 
38 template <typename Func, int Rank, typename Params>
39 struct TensorForEach {
40 
42  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {
43 
44  if (!grid_size || !block_size) {
45 
46  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
47  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
48  &grid_size,
49  &block_size,
50  reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
51 
52  if (result != cudaSuccess) {
53  throw std::runtime_error("Failed to query occupancy.");
54  }
55 
56  // Limit block size. This has the effect of increasing the number of items processed by a
57  // single thread and reduces the impact of initialization overhead.
58  block_size = (block_size < 128 ? block_size : 128);
59  }
60 
61  dim3 grid(grid_size, 1, 1);
62  dim3 block(block_size, 1, 1);
63 
64  kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
65  }
66 };
67 
69 
71 template <typename Func, int Rank, typename Params>
73 
75  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {
76 
77  if (end < 0) {
78  end = size.min();
79  }
80 
81  dim3 block(block_size, 1, 1);
82  dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
83 
84  kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
85  }
86 };
87 
88 
90 
91 template <typename Element, typename Func>
92 struct BlockForEach {
93 
96  Element *ptr,
97  size_t capacity,
98  typename Func::Params params = typename Func::Params(),
99  int grid_size = 0,
100  int block_size = 0) {
101 
102  if (!grid_size || !block_size) {
103 
104  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
105  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
106  &grid_size,
107  &block_size,
108  reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
109 
110  if (result != cudaSuccess) {
111  throw std::runtime_error("Failed to query occupancy.");
112  }
113 
114  // Limit block size. This has the effect of increasing the number of items processed by a
115  // single thread and reduces the impact of initialization overhead.
116  block_size = (block_size < 128 ? block_size : 128);
117  }
118 
119  dim3 grid(grid_size, 1, 1);
120  dim3 block(block_size, 1, 1);
121 
122  kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
123  }
124 };
125 
127 
128 } // namespace device
129 } // namespace reference
130 } // namesace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
27 #include <stdexcept>
28 #include "cutlass/cutlass.h"
30 
31 namespace cutlass {
32 namespace reference {
33 namespace device {
34 
36 
38 template <typename Func, int Rank, typename Params>
39 struct TensorForEach {
40 
42  TensorForEach(Coord<Rank> size, Params params = Params(), int grid_size = 0, int block_size = 0) {
43 
44  if (!grid_size || !block_size) {
45 
46  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
47  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
48  &grid_size,
49  &block_size,
50  reinterpret_cast<void const *>(kernel::TensorForEach<Func, Rank, Params>));
51 
52  if (result != cudaSuccess) {
53  throw std::runtime_error("Failed to query occupancy.");
54  }
55 
56  // Limit block size. This has the effect of increasing the number of items processed by a
57  // single thread and reduces the impact of initialization overhead.
58  block_size = (block_size < 128 ? block_size : 128);
59  }
60 
61  dim3 grid(grid_size, 1, 1);
62  dim3 block(block_size, 1, 1);
63 
64  kernel::TensorForEach<Func, Rank, Params><<< grid, block >>>(size, params);
65  }
66 };
67 
69 
71 template <typename Func, int Rank, typename Params>
73 
75  TensorDiagonalForEach(Coord<Rank> size, Params params = Params(), int start = 0, int end = -1, int block_size = 128) {
76 
77  if (end < 0) {
78  end = size.min();
79  }
80 
81  dim3 block(block_size, 1, 1);
82  dim3 grid((end - start + block_size - 1) / block_size, 1, 1);
83 
84  kernel::TensorDiagonalForEach<Func, Rank, Params><<< grid, block >>>(size, params, start, end);
85  }
86 };
87 
88 
90 
91 template <typename Element, typename Func>
92 struct BlockForEach {
93 
96  Element *ptr,
97  size_t capacity,
98  typename Func::Params params = typename Func::Params(),
99  int grid_size = 0,
100  int block_size = 0) {
101 
102  if (!grid_size || !block_size) {
103 
104  // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
105  cudaError_t result = cudaOccupancyMaxPotentialBlockSize(
106  &grid_size,
107  &block_size,
108  reinterpret_cast<void const *>(kernel::BlockForEach<Element, Func>));
109 
110  if (result != cudaSuccess) {
111  throw std::runtime_error("Failed to query occupancy.");
112  }
113 
114  // Limit block size. This has the effect of increasing the number of items processed by a
115  // single thread and reduces the impact of initialization overhead.
116  block_size = (block_size < 128 ? block_size : 128);
117  }
118 
119  dim3 grid(grid_size, 1, 1);
120  dim3 block(block_size, 1, 1);
121 
122  kernel::BlockForEach<Element, Func><<< grid, block >>>(ptr, capacity, params);
123  }
124 };
125 
127 
128 } // namespace device
129 } // namespace reference
130 } // namespace cutlass
Definition: aligned_buffer.h:35
TensorDiagonalForEach(Coord< Rank > size, Params params=Params(), int start=0, int end=-1, int block_size=128)
Constructor performs the operation.
Definition: device/tensor_foreach.h:75
TensorForEach(Coord< Rank > size, Params params=Params(), int grid_size=0, int block_size=0)
Constructor performs the operation.
Definition: device/tensor_foreach.h:42
Launches a kernel calling a functor for each element along a tensor&#39;s diagonal.
Definition: device/tensor_foreach.h:72
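
The `TensorForEach` wrapper above launches the grid-stride kernel from `device/kernel/tensor_foreach.h`, constructing a `Func` from its `Params` inside the kernel and invoking it once per in-bounds coordinate. A hedged usage sketch; the functor `WriteCoordFunc` and its behavior are invented for illustration:

```cpp
#include <cstdint>

#include "cutlass/coord.h"
#include "cutlass/cutlass.h"
#include "cutlass/util/reference/device/tensor_foreach.h"

// Hypothetical functor: encodes each coordinate into a row-major float buffer.
struct WriteCoordFunc {
  struct Params {
    float *ptr;
    cutlass::Coord<2> extent;
  };

  Params params;

  CUTLASS_DEVICE
  WriteCoordFunc(Params const &params_) : params(params_) {}

  CUTLASS_DEVICE
  void operator()(cutlass::Coord<2> const &coord) {
    int64_t offset = int64_t(coord[0]) * params.extent[1] + coord[1];  // row-major offset
    params.ptr[offset] = float(coord[0]) * 100.0f + float(coord[1]);
  }
};

// Host-side launch: constructing the struct queries occupancy and launches the kernel.
void fill_with_coords(float *device_ptr, int rows, int cols) {
  cutlass::Coord<2> extent = cutlass::make_Coord(rows, cols);
  WriteCoordFunc::Params params{device_ptr, extent};

  cutlass::reference::device::TensorForEach<WriteCoordFunc, 2, WriteCoordFunc::Params>(
      extent, params);
}
```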
diff --git a/docs/functions_func_s.html b/docs/functions_func_s.html index 78581310..548b6d97 100644 --- a/docs/functions_func_s.html +++ b/docs/functions_func_s.html @@ -141,7 +141,7 @@

- s -

-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 #include "cutlass/array.h"
33 #include "cutlass/aligned_buffer.h"
35 
36 #include "cutlass/numeric_types.h"
37 #include "cutlass/matrix_shape.h"
38 
39 #include "cutlass/gemm/gemm.h"
41 
43 
44 namespace cutlass {
45 namespace gemm {
46 namespace threadblock {
47 
49 
51 template <
53  typename Shape_,
55  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
56  typename IteratorA_,
59  typename SmemIteratorA_,
61  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
62  typename IteratorB_,
65  typename SmemIteratorB_,
67  typename ElementC_,
69  typename LayoutC_,
71  typename Policy_,
73  typename TransformA_ = NumericArrayConverter<
74  typename SmemIteratorA_::Element,
75  typename IteratorA_::Element,
76  IteratorA_::Fragment::kElements>,
79  typename TransformB_ = NumericArrayConverter<
80  typename SmemIteratorB_::Element,
81  typename IteratorB_::Element,
82  IteratorB_::Fragment::kElements>,
84  typename Enable = bool
85 >
86 class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
87 public:
88 
91 
92  using Shape = Shape_;
93  using IteratorA = IteratorA_;
94  using IteratorB = IteratorB_;
95  using ElementC = ElementC_;
96  using LayoutC = LayoutC_;
97  using Policy = Policy_;
98 
99  using SmemIteratorA = SmemIteratorA_;
100  using SmemIteratorB = SmemIteratorB_;
101 
102  using TransformA = TransformA_;
103  using TransformB = TransformB_;
104 
105  //
106  // Dependent types
107  //
108 
110  using FragmentA = typename IteratorA::Fragment;
111 
113  using FragmentB = typename IteratorB::Fragment;
114 
116  using FragmentC = typename Policy::Operator::FragmentC;
117 
119  using Operator = typename Policy::Operator;
120 
121  // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline)
122  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
123 
124 private:
125 
126  using WarpFragmentA = typename Operator::FragmentA;
127  using WarpFragmentB = typename Operator::FragmentB;
128 
129 protected:
130 
133 
136 
137 public:
138 
140  CUTLASS_DEVICE
142  typename Base::SharedStorage &shared_storage,
143  int thread_idx,
144  int warp_idx,
145  int lane_idx
146  ):
147  Base(shared_storage, thread_idx, warp_idx, lane_idx),
148  smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
149  smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
150 
151  // Compute warp location within threadblock tile by mapping the warp_id to
152  // three coordinates:
153  // _m: the warp's position within the threadblock along the M dimension
154  // _n: the warp's position within the threadblock along the N dimension
155  // _k: the warp's position within the threadblock along the K dimension
156 
157  int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
158  int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
159 
160  int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
161  int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
162 
163  // Add per-warp offsets in units of warp-level tiles
164  this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
165  this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
166  }
167 
169  CUTLASS_DEVICE
171  int gemm_k_iterations,
172  FragmentC &accum,
173  IteratorA iterator_A,
174  IteratorB iterator_B,
175  FragmentC const &src_accum,
176  TransformA transform_A = TransformA(),
177  TransformB transform_B = TransformB()) {
178 
179  //
180  // Prologue
181  //
182 
183  // Perform accumulation in the 'd' output operand
184  accum = src_accum;
185 
186  FragmentA tb_frag_A;
187  FragmentB tb_frag_B;
188 
189  tb_frag_A.clear();
190  tb_frag_B.clear();
191 
192  // The last kblock is loaded in the prolog
193  iterator_A.load(tb_frag_A);
194  iterator_B.load(tb_frag_B);
195 
196  ++iterator_A;
197  ++iterator_B;
198 
199  this->smem_iterator_A_.store(transform_A(tb_frag_A));
200  this->smem_iterator_B_.store(transform_B(tb_frag_B));
201 
202  ++this->smem_iterator_A_;
203  ++this->smem_iterator_B_;
204 
205  __syncthreads();
206 
207  // Pair of fragments used to overlap shared memory loads and math instructions
208  WarpFragmentA warp_frag_A[2];
209  WarpFragmentB warp_frag_B[2];
210 
211  this->warp_tile_iterator_A_.set_kgroup_index(0);
212  this->warp_tile_iterator_B_.set_kgroup_index(0);
213 
214  this->warp_tile_iterator_A_.load(warp_frag_A[0]);
215  this->warp_tile_iterator_B_.load(warp_frag_B[0]);
216 
217  ++this->warp_tile_iterator_A_;
218  ++this->warp_tile_iterator_B_;
219 
220  Operator warp_mma;
221 
222  int smem_write_stage_idx = 1;
223 
224  // Avoid reading out of bounds
225  if (gemm_k_iterations <= 1) {
226  iterator_A.clear_mask();
227  iterator_B.clear_mask();
228  }
229 
230  // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
231  // shared memory loads (which have the tighest latency requirement).
232 
233  //
234  // Mainloop
235  //
236 
237  // Note: The main loop does not support Base::kWarpGemmIterations == 2.
239  for (; gemm_k_iterations > 0; --gemm_k_iterations) {
240  //
241  // Loop over GEMM K dimension
242  //
243 
245  for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
246 
247  // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
248  // as the case may be.
249 
250  if (warp_mma_k == Base::kWarpGemmIterations - 1) {
251 
252  // Write fragments to shared memory
253  this->smem_iterator_A_.store(transform_A(tb_frag_A));
254 
255  this->smem_iterator_B_.store(transform_B(tb_frag_B));
256 
257  __syncthreads();
258 
259  ++this->smem_iterator_B_;
260  ++this->smem_iterator_A_;
261 
262  // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
263  if (smem_write_stage_idx == 1) {
264  this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
265  this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
266  }
267  else {
268  this->warp_tile_iterator_A_.add_tile_offset(
269  {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
270  this->warp_tile_iterator_B_.add_tile_offset(
271  {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
272  0});
273  }
274 
275  smem_write_stage_idx ^= 1;
276  }
277 
278  this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
279  this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
280 
281  this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
282  this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
283 
284  ++this->warp_tile_iterator_A_;
285  ++this->warp_tile_iterator_B_;
286 
287  if (warp_mma_k == 0) {
288 
289  iterator_A.load(tb_frag_A);
290  iterator_B.load(tb_frag_B);
291 
292  ++iterator_A;
293  ++iterator_B;
294 
295  // Avoid reading out of bounds if this was the last loop iteration
296  if (gemm_k_iterations <= 2) {
297  iterator_A.clear_mask();
298  iterator_B.clear_mask();
299  }
300  }
301 
302  warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
303  }
304  }
305 
306  }
307 };
308 
310 
311 } // namespace threadblock
312 } // namespace gemm
313 } // namespace cutlass
static int const kM
Definition: include/cutlass/gemm/gemm.h:58
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 #include "cutlass/array.h"
33 #include "cutlass/aligned_buffer.h"
35 
36 #include "cutlass/numeric_types.h"
37 #include "cutlass/matrix_shape.h"
38 
39 #include "cutlass/gemm/gemm.h"
41 
43 
44 namespace cutlass {
45 namespace gemm {
46 namespace threadblock {
47 
49 
51 template <
53  typename Shape_,
55  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
56  typename IteratorA_,
59  typename SmemIteratorA_,
61  // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator)
62  typename IteratorB_,
65  typename SmemIteratorB_,
67  typename ElementC_,
69  typename LayoutC_,
71  typename Policy_,
73  typename TransformA_ = NumericArrayConverter<
74  typename SmemIteratorA_::Element,
75  typename IteratorA_::Element,
76  IteratorA_::Fragment::kElements>,
79  typename TransformB_ = NumericArrayConverter<
80  typename SmemIteratorB_::Element,
81  typename IteratorB_::Element,
82  IteratorB_::Fragment::kElements>,
84  typename Enable = bool
85 >
86 class MmaPipelined : public MmaBase<Shape_, Policy_, 2> {
87 public:
88 
91 
92  using Shape = Shape_;
93  using IteratorA = IteratorA_;
94  using IteratorB = IteratorB_;
95  using ElementC = ElementC_;
96  using LayoutC = LayoutC_;
97  using Policy = Policy_;
98 
99  using SmemIteratorA = SmemIteratorA_;
100  using SmemIteratorB = SmemIteratorB_;
101 
102  using TransformA = TransformA_;
103  using TransformB = TransformB_;
104 
105  //
106  // Dependent types
107  //
108 
110  using FragmentA = typename IteratorA::Fragment;
111 
113  using FragmentB = typename IteratorB::Fragment;
114 
116  using FragmentC = typename Policy::Operator::FragmentC;
117 
119  using Operator = typename Policy::Operator;
120 
121  // statically assert kStages for MmaPipelined is two (Double-buffered pipeline)
122  static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2");
123 
124 private:
125 
126  using WarpFragmentA = typename Operator::FragmentA;
127  using WarpFragmentB = typename Operator::FragmentB;
128 
129 protected:
130 
133 
136 
137 public:
138 
140  CUTLASS_DEVICE
142  typename Base::SharedStorage &shared_storage,
143  int thread_idx,
144  int warp_idx,
145  int lane_idx
146  ):
147  Base(shared_storage, thread_idx, warp_idx, lane_idx),
148  smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx),
149  smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) {
150 
151  // Compute warp location within threadblock tile by mapping the warp_id to
152  // three coordinates:
153  // _m: the warp's position within the threadblock along the M dimension
154  // _n: the warp's position within the threadblock along the N dimension
155  // _k: the warp's position within the threadblock along the K dimension
156 
157  int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN);
158  int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN);
159 
160  int warp_idx_m = warp_idx_mn % Base::WarpCount::kM;
161  int warp_idx_n = warp_idx_mn / Base::WarpCount::kM;
162 
163  // Add per-warp offsets in units of warp-level tiles
164  this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k});
165  this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n});
166  }
167 
169  CUTLASS_DEVICE
171  int gemm_k_iterations,
172  FragmentC &accum,
173  IteratorA iterator_A,
174  IteratorB iterator_B,
175  FragmentC const &src_accum,
176  TransformA transform_A = TransformA(),
177  TransformB transform_B = TransformB()) {
178 
179  //
180  // Prologue
181  //
182 
183  // Perform accumulation in the 'd' output operand
184  accum = src_accum;
185 
186  FragmentA tb_frag_A;
187  FragmentB tb_frag_B;
188 
189  tb_frag_A.clear();
190  tb_frag_B.clear();
191 
192  // The last kblock is loaded in the prolog
193  iterator_A.load(tb_frag_A);
194  iterator_B.load(tb_frag_B);
195 
196  ++iterator_A;
197  ++iterator_B;
198 
199  this->smem_iterator_A_.store(transform_A(tb_frag_A));
200  this->smem_iterator_B_.store(transform_B(tb_frag_B));
201 
202  ++this->smem_iterator_A_;
203  ++this->smem_iterator_B_;
204 
205  __syncthreads();
206 
207  // Pair of fragments used to overlap shared memory loads and math instructions
208  WarpFragmentA warp_frag_A[2];
209  WarpFragmentB warp_frag_B[2];
210 
211  this->warp_tile_iterator_A_.set_kgroup_index(0);
212  this->warp_tile_iterator_B_.set_kgroup_index(0);
213 
214  this->warp_tile_iterator_A_.load(warp_frag_A[0]);
215  this->warp_tile_iterator_B_.load(warp_frag_B[0]);
216 
217  ++this->warp_tile_iterator_A_;
218  ++this->warp_tile_iterator_B_;
219 
220  Operator warp_mma;
221 
222  int smem_write_stage_idx = 1;
223 
224  // Avoid reading out of bounds
225  if (gemm_k_iterations <= 1) {
226  iterator_A.clear_mask();
227  iterator_B.clear_mask();
228  }
229 
230  // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing
231  // shared memory loads (which have the tightest latency requirement).
232 
233  //
234  // Mainloop
235  //
236 
237  // Note: The main loop does not support Base::kWarpGemmIterations == 2.
239  for (; gemm_k_iterations > 0; --gemm_k_iterations) {
240  //
241  // Loop over GEMM K dimension
242  //
243 
245  for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) {
246 
247  // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group
248  // as the case may be.
249 
250  if (warp_mma_k == Base::kWarpGemmIterations - 1) {
251 
252  // Write fragments to shared memory
253  this->smem_iterator_A_.store(transform_A(tb_frag_A));
254 
255  this->smem_iterator_B_.store(transform_B(tb_frag_B));
256 
257  __syncthreads();
258 
259  ++this->smem_iterator_B_;
260  ++this->smem_iterator_A_;
261 
262  // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory
263  if (smem_write_stage_idx == 1) {
264  this->smem_iterator_A_.add_tile_offset({0, -Base::kStages});
265  this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0});
266  }
267  else {
268  this->warp_tile_iterator_A_.add_tile_offset(
269  {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations});
270  this->warp_tile_iterator_B_.add_tile_offset(
271  {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations,
272  0});
273  }
274 
275  smem_write_stage_idx ^= 1;
276  }
277 
278  this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
279  this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations);
280 
281  this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]);
282  this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]);
283 
284  ++this->warp_tile_iterator_A_;
285  ++this->warp_tile_iterator_B_;
286 
287  if (warp_mma_k == 0) {
288 
289  iterator_A.load(tb_frag_A);
290  iterator_B.load(tb_frag_B);
291 
292  ++iterator_A;
293  ++iterator_B;
294 
295  // Avoid reading out of bounds if this was the last loop iteration
296  if (gemm_k_iterations <= 2) {
297  iterator_A.clear_mask();
298  iterator_B.clear_mask();
299  }
300  }
301 
302  warp_mma(accum, warp_frag_A[warp_mma_k % 2], warp_frag_B[warp_mma_k % 2], accum);
303  }
304  }
305 
306  }
307 };
308 
310 
311 } // namespace threadblock
312 } // namespace gemm
313 } // namespace cutlass
static int const kM
Definition: include/cutlass/gemm/gemm.h:58
LayoutC_ LayoutC
Layout of accumulator matrix.
Definition: mma_pipelined.h:96
TransformB_ TransformB
Definition: mma_pipelined.h:103
Definition: aligned_buffer.h:35
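The MmaPipelined listing above maps each warp's linear index to (m, n, k) coordinates within the threadblock and then alternates a two-stage shared-memory write index once per threadblock tile. Below is a minimal host-side sketch, not part of this patch or of CUTLASS, that reproduces the same index arithmetic under an assumed warp arrangement of WarpCount::kM = 4 and WarpCount::kN = 2.

// Minimal host-side sketch (assumptions: WarpCount::kM = 4, WarpCount::kN = 2;
// not CUTLASS code) of the warp-index decomposition and the double-buffered
// shared-memory stage toggle shown in the listing above.
#include <cstdio>

int main() {
  int const kWarpCountM = 4;   // assumed warps along M
  int const kWarpCountN = 2;   // assumed warps along N

  for (int warp_idx = 0; warp_idx < kWarpCountM * kWarpCountN; ++warp_idx) {
    int warp_idx_mn = warp_idx % (kWarpCountM * kWarpCountN);
    int warp_idx_k  = warp_idx / (kWarpCountM * kWarpCountN);
    int warp_idx_m  = warp_idx_mn % kWarpCountM;
    int warp_idx_n  = warp_idx_mn / kWarpCountM;
    std::printf("warp %d -> m=%d n=%d k=%d\n", warp_idx, warp_idx_m, warp_idx_n, warp_idx_k);
  }

  // Two-stage circular buffer: the prologue fills stage 0, so the first
  // mainloop store targets stage 1; the index then alternates via XOR.
  int smem_write_stage_idx = 1;
  for (int gemm_k_iterations = 4; gemm_k_iterations > 0; --gemm_k_iterations) {
    std::printf("remaining k iterations %d: store goes to stage %d\n",
                gemm_k_iterations, smem_write_stage_idx);
    smem_write_stage_idx ^= 1;
  }
  return 0;
}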
diff --git a/docs/namespacecutlass_1_1gemm_1_1device.html b/docs/namespacecutlass_1_1gemm_1_1device.html index 7023f4f8..c0b27cbe 100644 --- a/docs/namespacecutlass_1_1gemm_1_1device.html +++ b/docs/namespacecutlass_1_1gemm_1_1device.html @@ -134,17 +134,17 @@ class  Gemm   class  Gemm< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, SplitKSerial, Operator_, IsBetaZero > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmBatched   class  GemmBatched< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, AlignmentA, AlignmentB, Operator_ > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmComplex   class  GemmComplex< ElementA_, LayoutA_, ElementB_, LayoutB_, ElementC_, layout::ColumnMajor, ElementAccumulator_, OperatorClass_, ArchTag_, ThreadblockShape_, WarpShape_, InstructionShape_, EpilogueOutputOp_, ThreadblockSwizzle_, Stages, TransformA, TransformB, SplitKSerial > - Parital specialization for column-major output exchanges problem size and operand. More...
+ Partial specialization for column-major output exchanges problem size and operand. More...
  class  GemmSplitKParallel   diff --git a/docs/namespacecutlass_1_1reference_1_1device.html b/docs/namespacecutlass_1_1reference_1_1device.html index 86f21a00..54f5009f 100644 --- a/docs/namespacecutlass_1_1reference_1_1device.html +++ b/docs/namespacecutlass_1_1reference_1_1device.html @@ -125,7 +125,7 @@  Partial specialization for multiply-add-saturate. More...
  struct  Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc > - Parital specialization for XOR-popc. More...
+ Partial specialization for XOR-popc. More...
  struct  TensorDiagonalForEach  Launches a kernel calling a functor for each element along a tensor's diagonal. More...
@@ -183,7 +183,7 @@   template<typename Element , typename Layout > void TensorFillIdentity (TensorView< Element, Layout > view) - Fills a tensor's digonal with 1 and 0 everywhere else. More...
+ Fills a tensor's diagonal with 1 and 0 everywhere else. More...
  template<typename Element , typename Layout > void TensorUpdateDiagonal (TensorView< Element, Layout > view, Element diag=Element(1)) diff --git a/docs/namespacecutlass_1_1reference_1_1host.html b/docs/namespacecutlass_1_1reference_1_1host.html index 90f9a014..6d07d1f2 100644 --- a/docs/namespacecutlass_1_1reference_1_1host.html +++ b/docs/namespacecutlass_1_1reference_1_1host.html @@ -122,7 +122,7 @@  Partial specialization for multiply-add-saturate. More...
  struct  Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc > - Parital specialization for XOR-popc. More...
+ Partial specialization for XOR-popc. More...
  - + diff --git a/docs/search/all_12.js b/docs/search/all_12.js index 0e091040..c9f8a45c 100644 --- a/docs/search/all_12.js +++ b/docs/search/all_12.js @@ -14,7 +14,7 @@ var searchData= ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html',1,'cutlass']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore::Semaphore()'],['../structcutlass_1_1gemm_1_1kernel_1_1Gemm_1_1Params.html#adec6d0c6d74e7f456196f453e302fbbb',1,'cutlass::gemm::kernel::Gemm::Params::semaphore()']]], ['semaphore_2eh',['semaphore.h',['../semaphore_8h.html',1,'']]], - ['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], + ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['sequential',['sequential',['../structcutlass_1_1Distribution.html#ab86d975567ef141ff82067b1f41cd3ee',1,'cutlass::Distribution::sequential()'],['../structcutlass_1_1Distribution.html#a499f4023e0d42356ce71d38cc32bf92aa39d3cf55e90573c8d1dfb483cfb410dc',1,'cutlass::Distribution::Sequential()']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], diff --git a/docs/search/functions_12.js b/docs/search/functions_12.js index f2b3bff9..6648b431 100644 --- a/docs/search/functions_12.js +++ b/docs/search/functions_12.js @@ -3,7 +3,7 @@ var searchData= ['scalar_5fop',['scalar_op',['../structcutlass_1_1minimum_3_01Array_3_01T_00_01N_01_4_01_4.html#a4b42227184cb7c796460062c46a84b57',1,'cutlass::minimum< Array< T, N > >']]], ['scalario',['ScalarIO',['../structcutlass_1_1ScalarIO.html#ad4166575521254088bf6c6300c351714',1,'cutlass::ScalarIO::ScalarIO()'],['../structcutlass_1_1ScalarIO.html#a5227e1e9ed24326ad4f8dc94d186186f',1,'cutlass::ScalarIO::ScalarIO(T value)']]], ['semaphore',['Semaphore',['../classcutlass_1_1Semaphore.html#a2ce4cd07fe773efa429f726cfbd98070',1,'cutlass::Semaphore']]], - ['seperate_5fstring',['seperate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], + ['separate_5fstring',['separate_string',['../structcutlass_1_1CommandLine.html#a5f86e4b2bd8c44b739c83530d77c5590',1,'cutlass::CommandLine']]], ['set',['set',['../classcutlass_1_1PredicateVector_1_1Iterator.html#aadfd039b5622098c9e46706a27122575',1,'cutlass::PredicateVector::Iterator::set()'],['../structcutlass_1_1PredicateVector.html#a062fa8a8df725ef08ced2ffcca8336af',1,'cutlass::PredicateVector::set()'],['../classcutlass_1_1SubbyteReference.html#a6473e57520d8ee7afbd95c1e1641e05a',1,'cutlass::SubbyteReference::set()']]], ['set_5fgaussian',['set_gaussian',['../structcutlass_1_1Distribution.html#ad594b5ec1d577e8ef03d4d808a8220b1',1,'cutlass::Distribution']]], ['set_5fidentity',['set_identity',['../structcutlass_1_1Distribution.html#aad2cf02af3d520544d89843cc4295858',1,'cutlass::Distribution']]], diff --git a/docs/structcutlass_1_1CommandLine-members.html b/docs/structcutlass_1_1CommandLine-members.html index 
77668951..6a17b2f0 100644 --- a/docs/structcutlass_1_1CommandLine-members.html +++ b/docs/structcutlass_1_1CommandLine-members.html @@ -115,7 +115,7 @@ - + diff --git a/docs/structcutlass_1_1CommandLine.html b/docs/structcutlass_1_1CommandLine.html index 0bde0ec4..01cd35a2 100644 --- a/docs/structcutlass_1_1CommandLine.html +++ b/docs/structcutlass_1_1CommandLine.html @@ -151,7 +151,7 @@ - +

@@ -247,7 +247,7 @@

 
template<typename Element , typename Layout >
void TensorFillIdentity (TensorView< Element, Layout > dst)
 Helper to fill a tensor's digonal with 1 and 0 everywhere else. More...
 Helper to fill a tensor's diagonal with 1 and 0 everywhere else. More...
 
template<typename Element , typename Layout >
void TensorUpdateDiagonal (TensorView< Element, Layout > dst, Element val=Element(1))
keyscutlass::CommandLine
num_naked_args() const cutlass::CommandLineinline
parsed_argc() const cutlass::CommandLineinline
seperate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')cutlass::CommandLineinlinestatic
separate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',')cutlass::CommandLineinlinestatic
tokenize(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':')cutlass::CommandLineinlinestatic
tokenize(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':')cutlass::CommandLineinlinestatic
valuescutlass::CommandLine
 Tokenizes a comma-delimited list of string pairs delimited by ':'. More...
 
template<typename value_t >
static void seperate_string (std::string const &str, std::vector< value_t > &vals, char sep= ',')
static void separate_string (std::string const &str, std::vector< value_t > &vals, char sep= ',')
 
- +

@@ -548,7 +548,7 @@

Member Function Documentation

- + diff --git a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html index 37cb3e5d..2f4bf08e 100644 --- a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html +++ b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html @@ -104,7 +104,7 @@
-

Parital specialization for XOR-popc. +

Partial specialization for XOR-popc.

#include <gemm.h>

diff --git a/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html b/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html index 2c89af68..2daeadcc 100644 --- a/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html +++ b/docs/structcutlass_1_1reference_1_1device_1_1kernel_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html @@ -112,7 +112,7 @@
- +
static void cutlass::CommandLine::seperate_string static void cutlass::CommandLine::separate_string ( std::string const &  str,

Public Member Functions

__inline__ __device__ TensorForEachHelper (Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index)
 Constructor for fastest chaning rank. More...
 Constructor for fastest changing rank. More...
 

Constructor & Destructor Documentation

diff --git a/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html b/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html index 0840df59..7a83a97d 100644 --- a/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html +++ b/docs/structcutlass_1_1reference_1_1host_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01LayoutB_4f3f32c4b336238abfd741e87bfced46.html @@ -104,7 +104,7 @@
-

Parital specialization for XOR-popc. +

Partial specialization for XOR-popc.

#include <gemm.h>

diff --git a/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html b/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html index 2e440e45..6c63b40e 100644 --- a/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html +++ b/docs/structcutlass_1_1reference_1_1host_1_1detail_1_1TensorForEachHelper_3_01Func_00_01Rank_00_010_01_4.html @@ -113,7 +113,7 @@

Public Member Functions

 TensorForEachHelper (Func &func, Coord< Rank > const &extent, Coord< Rank > &coord)
 Constructor for fastest chaning rank. More...
 Constructor for fastest changing rank. More...
 
- +

diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html index 2a0a978e..cc752855 100644 --- a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html +++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html @@ -134,7 +134,7 @@

 Partial specialization for multiply-add-saturate. More...
 
struct  cutlass::reference::device::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, AccumulatorType, arch::OpXorPopc >
 Parital specialization for XOR-popc. More...
 Partial specialization for XOR-popc. More...
 
- +

diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html index d20a0784..b0bfdbc2 100644 --- a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html +++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html @@ -141,7 +141,7 @@

 Partial specialization for multiply-add-saturate. More...
 
struct  cutlass::reference::host::Gemm< ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ScalarType, ComputeType, arch::OpXorPopc >
 Parital specialization for XOR-popc. More...
 Partial specialization for XOR-popc. More...
 

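The reference device and host Gemm structures documented above exist to verify the optimized kernels against a simple result. As a hedged illustration of what such a reference computes, the short host program below performs the plain multiply-add case with a naive triple loop; it is illustrative only and is not the CUTLASS reference implementation.

// Illustrative host sketch (not CUTLASS code): the kind of naive triple loop
// a reference GEMM uses to verify a device kernel, here with row-major A, B,
// and C and a plain multiply-add accumulation.
#include <cstdio>
#include <vector>

int main() {
  int const M = 2, N = 3, K = 4;
  std::vector<float> A(M * K, 1.0f), B(K * N, 2.0f), C(M * N, 0.0f);
  float alpha = 1.0f, beta = 0.0f;

  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[m * K + k] * B[k * N + n];   // multiply-add over the K dimension
      }
      C[m * N + n] = alpha * acc + beta * C[m * N + n];
    }
  }
  std::printf("C[0][0] = %.1f (expected %.1f)\n", C[0], 2.0f * K);
  return 0;
}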
diff --git a/docs/wmma__sm75_8h_source.html b/docs/wmma__sm75_8h_source.html index 72ad72f9..6ff6405d 100644 --- a/docs/wmma__sm75_8h_source.html +++ b/docs/wmma__sm75_8h_source.html @@ -98,7 +98,7 @@
wmma_sm75.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include "cutlass/layout/matrix.h"
33 
35 namespace cutlass {
36 namespace arch {
37 
39 //
40 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
41 // wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
42 //
44 template <
45 typename Shape_,
46 typename LayoutA_,
47 typename LayoutB_,
48 typename LayoutC_>
49 struct Wmma<
50  Shape_,
51  cutlass::int4b_t,
52  LayoutA_,
54  LayoutB_,
55  int32_t,
56  LayoutC_,
57  cutlass::arch::OpMultiplyAdd
58 > {
59 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
60  using Shape = Shape_;
61  using ElementA = cutlass::int4b_t;
62  using LayoutA = LayoutA_;
63  using ElementB = cutlass::int4b_t;
64  using LayoutB = LayoutB_;
65  using ElementC = int32_t;
66  using LayoutC = LayoutC_;
67  using Operator = cutlass::arch::OpMultiplyAdd;
68 
69  // check supported wmma shape for the given multiplicand data types
72  "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
73 
74 
75  // Wmma Fragment
76  using FragmentA = nvcuda::wmma::fragment<
77  nvcuda::wmma::matrix_a,
78  Shape::kM,
79  Shape::kN,
80  Shape::kK,
81  typename CutlassToWmmaDataType<ElementA>::Type,
82  typename CutlassToWmmaLayout<LayoutA>::Layout>;
83 
84  using FragmentB = nvcuda::wmma::fragment<
85  nvcuda::wmma::matrix_b,
86  Shape::kM,
87  Shape::kN,
88  Shape::kK,
89  typename CutlassToWmmaDataType<ElementB>::Type,
90  typename CutlassToWmmaLayout<LayoutB>::Layout>;
91 
92  using FragmentC = nvcuda::wmma::fragment<
93  nvcuda::wmma::accumulator,
94  Shape::kM,
95  Shape::kN,
96  Shape::kK,
97  typename CutlassToWmmaDataType<ElementC>::Type>;
98 
100  CUTLASS_DEVICE
101  void operator()(
102  FragmentC &D,
103  FragmentA const &A,
104  FragmentB const &B,
105  FragmentC const &C) const {
106  nvcuda::wmma::mma_sync(D, A, B, C);
107  }
108 
109 #else
110  static_assert(false, "wmma.mma.sync integer type multiplicands are available only for SM75 and beyond");
111 #endif
112 
113 };
114 
116 //
117 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
118 // wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1)
119 // (nvcuda::wmma targetting SASS instruction BMMA)
120 //
122 template <
123 typename Shape_,
124 typename LayoutA_,
125 typename LayoutB_,
126 typename LayoutC_>
127 struct Wmma<
128  Shape_,
129  cutlass::uint1b_t,
130  LayoutA_,
132  LayoutB_,
133  int32_t,
134  LayoutC_,
135  cutlass::arch::OpXorPopc
136 > {
137 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
138  using Shape = Shape_;
139  using ElementA = cutlass::uint1b_t;
140  using LayoutA = LayoutA_;
141  using ElementB = cutlass::uint1b_t;
142  using LayoutB = LayoutB_;
143  using ElementC = int32_t;
144  using LayoutC = LayoutC_;
145  using Operator = cutlass::arch::OpXorPopc;
146 
147  // check supported wmma shape for the given multiplicand data types
150  "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
151 
152 
153  // Wmma Fragment
154  using FragmentA = nvcuda::wmma::fragment<
155  nvcuda::wmma::matrix_a,
156  Shape::kM,
157  Shape::kN,
158  Shape::kK,
159  typename CutlassToWmmaDataType<ElementA>::Type,
160  typename CutlassToWmmaLayout<LayoutA>::Layout>;
161 
162  using FragmentB = nvcuda::wmma::fragment<
163  nvcuda::wmma::matrix_b,
164  Shape::kM,
165  Shape::kN,
166  Shape::kK,
167  typename CutlassToWmmaDataType<ElementB>::Type,
168  typename CutlassToWmmaLayout<LayoutB>::Layout>;
169 
170  using FragmentC = nvcuda::wmma::fragment<
171  nvcuda::wmma::accumulator,
172  Shape::kM,
173  Shape::kN,
174  Shape::kK,
175  typename CutlassToWmmaDataType<ElementC>::Type>;
176 
178  CUTLASS_DEVICE
179  void operator()(
180  FragmentC &D,
181  FragmentA const &A,
182  FragmentB const &B,
183  FragmentC const &C) const {
184 
185  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
186  nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
187  }
188 
189 #else
190  static_assert(false, "wmma.mma.sync integer type multiplicands are available only for SM75 and beyond");
191 #endif
192 
193 };
194 
195 } // namespace arch
196 } // namespace cutlass
Definition: aligned_buffer.h:35
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include "cutlass/layout/matrix.h"
33 
35 namespace cutlass {
36 namespace arch {
37 
39 //
40 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
41 // wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4).
42 //
44 template <
45 typename Shape_,
46 typename LayoutA_,
47 typename LayoutB_,
48 typename LayoutC_>
49 struct Wmma<
50  Shape_,
51  cutlass::int4b_t,
52  LayoutA_,
54  LayoutB_,
55  int32_t,
56  LayoutC_,
57  cutlass::arch::OpMultiplyAdd
58 > {
59 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
60  using Shape = Shape_;
61  using ElementA = cutlass::int4b_t;
62  using LayoutA = LayoutA_;
63  using ElementB = cutlass::int4b_t;
64  using LayoutB = LayoutB_;
65  using ElementC = int32_t;
66  using LayoutC = LayoutC_;
67  using Operator = cutlass::arch::OpMultiplyAdd;
68 
69  // check supported wmma shape for the given multiplicand data types
72  "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32");
73 
74 
75  // Wmma Fragment
76  using FragmentA = nvcuda::wmma::fragment<
77  nvcuda::wmma::matrix_a,
78  Shape::kM,
79  Shape::kN,
80  Shape::kK,
81  typename CutlassToWmmaDataType<ElementA>::Type,
82  typename CutlassToWmmaLayout<LayoutA>::Layout>;
83 
84  using FragmentB = nvcuda::wmma::fragment<
85  nvcuda::wmma::matrix_b,
86  Shape::kM,
87  Shape::kN,
88  Shape::kK,
89  typename CutlassToWmmaDataType<ElementB>::Type,
90  typename CutlassToWmmaLayout<LayoutB>::Layout>;
91 
92  using FragmentC = nvcuda::wmma::fragment<
93  nvcuda::wmma::accumulator,
94  Shape::kM,
95  Shape::kN,
96  Shape::kK,
97  typename CutlassToWmmaDataType<ElementC>::Type>;
98 
100  CUTLASS_DEVICE
101  void operator()(
102  FragmentC &D,
103  FragmentA const &A,
104  FragmentB const &B,
105  FragmentC const &C) const {
106  nvcuda::wmma::mma_sync(D, A, B, C);
107  }
108 
109 #else
110  static_assert(false, "wmma.mma.sync integer type multiplicands are available only for SM75 and beyond");
111 #endif
112 
113 };
114 
116 //
117 // WMMA template structure defines nvcuda::wmma::fragments and static assert for
118 // wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1)
119 // (nvcuda::wmma targeting SASS instruction BMMA)
120 //
122 template <
123 typename Shape_,
124 typename LayoutA_,
125 typename LayoutB_,
126 typename LayoutC_>
127 struct Wmma<
128  Shape_,
129  cutlass::uint1b_t,
130  LayoutA_,
132  LayoutB_,
133  int32_t,
134  LayoutC_,
135  cutlass::arch::OpXorPopc
136 > {
137 #if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED)
138  using Shape = Shape_;
139  using ElementA = cutlass::uint1b_t;
140  using LayoutA = LayoutA_;
141  using ElementB = cutlass::uint1b_t;
142  using LayoutB = LayoutB_;
143  using ElementC = int32_t;
144  using LayoutC = LayoutC_;
145  using Operator = cutlass::arch::OpXorPopc;
146 
147  // check supported wmma shape for the given multiplicand data types
150  "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128");
151 
152 
153  // Wmma Fragment
154  using FragmentA = nvcuda::wmma::fragment<
155  nvcuda::wmma::matrix_a,
156  Shape::kM,
157  Shape::kN,
158  Shape::kK,
159  typename CutlassToWmmaDataType<ElementA>::Type,
160  typename CutlassToWmmaLayout<LayoutA>::Layout>;
161 
162  using FragmentB = nvcuda::wmma::fragment<
163  nvcuda::wmma::matrix_b,
164  Shape::kM,
165  Shape::kN,
166  Shape::kK,
167  typename CutlassToWmmaDataType<ElementB>::Type,
168  typename CutlassToWmmaLayout<LayoutB>::Layout>;
169 
170  using FragmentC = nvcuda::wmma::fragment<
171  nvcuda::wmma::accumulator,
172  Shape::kM,
173  Shape::kN,
174  Shape::kK,
175  typename CutlassToWmmaDataType<ElementC>::Type>;
176 
178  CUTLASS_DEVICE
179  void operator()(
180  FragmentC &D,
181  FragmentA const &A,
182  FragmentB const &B,
183  FragmentC const &C) const {
184 
185  nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR,
186  nvcuda::wmma::experimental::bmmaAccumulateOpPOPC);
187  }
188 
189 #else
190  static_assert(false, "wmma.mma.sync integer type multiplicands are available only for SM75 and beyond");
191 #endif
192 
193 };
194 
195 } // namespace arch
196 } // namespace cutlass
Definition: aligned_buffer.h:35
std::is_same (false specialization)
Definition: platform.h:394
integer_subbyte< 1, false > uint1b_t
1-bit Unsigned integer type
Definition: integer_subbyte.h:152
4-bit signed integer type
Definition: integer_subbyte.h:42
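The uint1b_t specialization in the listing above routes through nvcuda::wmma::bmma_sync with bmmaBitOpXOR and bmmaAccumulateOpPOPC. As a host-only illustration of those semantics (an assumption-level sketch, not the device code path), one multiply-accumulate step over K = 32 one-bit elements is an XOR of the packed operands followed by a population count added into the accumulator.

// Host-only illustration (assumption: this mirrors, but is not, the device
// WMMA path): one XOR-popc multiply-accumulate over K = 32 one-bit elements
// packed into a 32-bit word per operand.
#include <bitset>
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a = 0xB2B2B2B2u;   // 32 one-bit A elements along K
  uint32_t b = 0x55555555u;   // 32 one-bit B elements along K
  int32_t  c = 7;             // prior accumulator value

  // d = c + popcount(a XOR b): XOR plays the role of the 1-bit multiply,
  // population count plays the role of the K-dimension accumulation.
  int32_t d = c + static_cast<int32_t>(std::bitset<32>(a ^ b).count());

  std::printf("d = %d\n", d);
  return 0;
}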
diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 57df36be..baa5e059 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -47,7 +47,7 @@ or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are prevalent in the CUTLASS unit tests. - This example has delibrately been kept similar to the basic_gemm example from cutass-1.3 to + This example has delibrately been kept similar to the basic_gemm example from cutlass-1.3 to highlight the minimum amount of differences needed to transition to cutlass-2.0. Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index c38f040d..6560f83d 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -75,7 +75,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32, 64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally deduce the amount of threads needed per thread-block, amount of shared memory, storing data in -bank-conflict free manner, and ton of other variables required to compose, intialize and launch a +bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -107,7 +107,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS GEMM kernel using cutlass::gemm::device::Gemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and doesn't come in the way of learning CUTLASS. @@ -115,7 +115,7 @@ Once all the matrices are initialized and filled with data, create arguments tup kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as reference GEMM kernel. 
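The example descriptions above walk through building an arguments tuple (problem size, operand references, alpha/beta, the split-K factor), querying scratch-space memory, initializing the kernel, and launching it. The condensed sketch below follows that flow; it assumes a plain single-precision SIMT Gemm instantiation so it stays self-contained, leaves the operands unfilled, and is a hedged illustration rather than the exact code of the examples.

// Hedged sketch of the argument/launch flow described above. Assumptions:
// a default SIMT SGEMM instantiation and uninitialized operand tensors;
// the real examples pick tensor-op tile shapes and fill the tensors.
#include <cstdint>
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/device_memory.h"

using Gemm = cutlass::gemm::device::Gemm<float, cutlass::layout::ColumnMajor,
                                         float, cutlass::layout::ColumnMajor,
                                         float, cutlass::layout::ColumnMajor>;

int main() {
  cutlass::gemm::GemmCoord problem_size(5120, 4096, 4096);  // M, N, K from the text above
  int split_k_slices = 1;
  float alpha = 1.0f, beta = 0.0f;

  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> A(problem_size.mk());
  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> B(problem_size.kn());
  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> C(problem_size.mn());
  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> D(problem_size.mn());

  // Arguments tuple: problem size, operand references, epilogue scalars, split-K factor.
  Gemm::Arguments arguments{problem_size,
                            A.device_ref(), B.device_ref(),
                            C.device_ref(), D.device_ref(),
                            {alpha, beta}, split_k_slices};

  // Query and allocate any scratch space the instantiated kernel requires.
  size_t workspace_size = Gemm::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  Gemm gemm_op;
  cutlass::Status status = gemm_op.initialize(arguments, workspace.get());
  if (status != cutlass::Status::kSuccess) return 1;

  status = gemm_op();   // launch the kernel
  return (status == cutlass::Status::kSuccess) ? 0 : 1;
}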
diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index bcff579b..f627b842 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -74,7 +74,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x256x64, 64x64x16, 8x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally deduce the amount of threads needed per thread-block, amount of shared memory, storing data in -bank-conflict free manner, and ton of other variables required to compose, intialize and launch a +bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -106,7 +106,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS GEMM kernel using cutlass::gemm::device::Gemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare matrices as they are simple and doesn't come in the way of learning CUTLASS. @@ -114,7 +114,7 @@ Once all the matrices are initialized and filled with data, create arguments tup kernel which takes problem size (M = 5120, N = 4096 and K = 4096), matrices, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference gemm kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as reference GEMM kernel. diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index e39784ee..bfa4f8f3 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -76,7 +76,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x128, 64x64x128, 8x8x32 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it internally deduces the amount of threads needed per thread-block, amount of shared memory, storing -data in bank-conflict free manner, and ton of other variables required to compose, intialize and +data in bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. 
@@ -108,7 +108,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS Implicit GEMM kernel using cutlass::conv::device::ImplicitGemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come in the way of learning CUTLASS. @@ -117,7 +117,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h index 36d4563a..b3dfd6f4 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h @@ -321,7 +321,7 @@ class B2bImplicitGemmPipelined : int smem_write_stage_idx = 1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop @@ -461,7 +461,7 @@ class B2bImplicitGemmPipelined : int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h index 828426b1..c1799fa3 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h @@ -341,7 +341,7 @@ class B2bImplicitGemmPipelinedSmemAccumulator : int smem_write_stage_idx = 1; // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). 
// // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h index c36d1332..7afa503a 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h @@ -325,7 +325,7 @@ class B2bMmaPipelined : iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h index 351fae37..b78892e1 100644 --- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h @@ -346,7 +346,7 @@ class B2bMmaPipelinedSmemAccumulator : iterator_B0.clear_mask(gemm_k_iterations_0 <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index 378b4898..b30d9086 100644 --- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -73,7 +73,7 @@ Now that we setup the properties of data, we have to setup properties of computa Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x64, 64x64x64, 16x8x16 (MxNxK) respectively. When passed to instantiate CUTLASS Implicit GEMM kernel, it internally deduces the amount of threads needed per thread-block, amount of shared memory, storing -data in bank-conflict free manner, and ton of other variables required to compose, intialize and +data in bank-conflict free manner, and ton of other variables required to compose, initialize and launch a high performance Implicit GEMM kernel. This is the beauty of CUTLASS, it relieves developer from understanding and coding complicated hardware optimizations which can easily go wrong. @@ -95,7 +95,7 @@ is done which threadblock launched on an SM, CUDA SM architecture of GPU you wan These are all put together to create a template variable which describes CUTLASS Implicit GEMM kernel using cutlass::conv::device::ImplicitGemm template. -The next step is to intialize physical data, instantiate and initialize CUTLASS kernel and run it. +The next step is to initialize physical data, instantiate and initialize CUTLASS kernel and run it. We use CUTLASS utilities to initialize, fill, compare tensors as they are simple and doesn't come in the way of learning CUTLASS. @@ -104,7 +104,7 @@ kernel which takes problem size (N = 1, H = 64, W = 64, C = 128), filter size (K R = 3, S = 3, C = 128 ), padding, strides, dilation, tensors, alpha, beta and the important one, split k-dimension factor. Along with that, we query CUTLASS if any scratch-space memory required by the kernel we instantiated. 
If yes, we create it and pass it along with other -arguments created to intialize CUTLASS kernel then, the kernel is launched. +arguments created to initialize CUTLASS kernel then, the kernel is launched. In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu index 81a3e15a..49bfe2f8 100644 --- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu +++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu @@ -36,7 +36,7 @@ computing GEMM. So the output also contains either a Mx1 or 1XN vector. It onl core instructions. Most of the reduction is done in gemm/warp level, see gemm/warp/mma_with_reduction_tensor_op.h -A few bit of reduction is done in the epilouge before storing the vector, see +A few bit of reduction is done in the epilogue before storing the vector, see epilogue/threadblock/epilogue_gemm_k_reduction.h */ diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu index d3ffef76..a0604018 100644 --- a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu @@ -1088,7 +1088,7 @@ int main(int argc, char const **args) { // Determine kernel configuration based on head size. // If head size is less than or equal to 64, each block operates over 64 queries and - // 64 keys, and parital results can be stored in the register file. + // 64 keys, and partial results can be stored in the register file. // If head size is greater than 64, each block operates over 32 queries and 128 keys, // and partial results are stored in shared memory. if (options.head_size_v > 64) { diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu index f3e2879f..f2568e3a 100644 --- a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu +++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu @@ -1173,7 +1173,7 @@ int main(int argc, char const **args) { // Determine kernel configuration based on head size. // If head size is less than or equal to 64, each block operates over 64 queries and - // 64 keys, and parital results can be stored in the register file. + // 64 keys, and partial results can be stored in the register file. // If head size is greater than 64, each block operates over 32 queries and 128 keys, // and partial results are stored in shared memory. 
if (options.head_size_v > 64) { diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h index 73112e9a..f074fdbd 100644 --- a/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h +++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h @@ -310,7 +310,7 @@ class CustomMmaPipelined : public CustomMmaBase { iterator_B.clear_mask(gemm_k_iterations <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* - // issuing shared memory loads (which have the tighest latency requirement). + // issuing shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h index 993af37a..bc67806b 100644 --- a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h +++ b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h @@ -600,7 +600,7 @@ class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory< iterator_B.clear_mask(gemm_k_iterations <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* - // issuing shared memory loads (which have the tighest latency requirement). + // issuing shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h index b9c38cc3..d49bf83e 100644 --- a/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h +++ b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h @@ -181,7 +181,7 @@ class PredicatedTileAccessIteratorResidualLast< BytePointer pointer_; /// Below is used when Gather is turned on. 
We need to record strided_offset - /// and contiguous_offset seperated to compute the offset by using + /// and contiguous_offset separated to compute the offset by using /// /// offset = contiguous_offset + indices[strided_offset] /// diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py index 7512dc1c..bb3c76fc 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py @@ -86,14 +86,14 @@ def gen_using_MmaCore(self, stage): "OperatorClass", str(stage), "Operator") return gen_code - def gen_using_FusedAddBiasEpilouge(self): + def gen_using_FusedAddBiasEpilogue(self): gen_code = "" for i in range(self.b2b_num - 1): - code_using = helper.var_idx("using FusedAddBiasEpilouge", i) - epilouge_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp" + code_using = helper.var_idx("using FusedAddBiasEpilogue", i) + epilogue_name = "typename cutlass::epilogue::threadblock::DefaultFusedBiasActEpilogueTensorOp" template_args = helper.var_idx("::Epilogue" - gen_code += code_using + " = " + epilouge_name + template_args + ";\n" + gen_code += code_using + " = " + epilogue_name + template_args + ";\n" return gen_code @@ -161,12 +161,12 @@ def gen_threadblockmma(self): MmaPipelined_param_list += "ElementAccumulator0, layout::RowMajor, " for i in range(self.b2b_num - 1): - epilouge_name = "EpilogueOutputOp" + str(i) - MmaPipelined_param_list += epilouge_name + ", " + epilogue_name = "EpilogueOutputOp" + str(i) + MmaPipelined_param_list += epilogue_name + ", " for i in range(self.b2b_num - 1): - epilouge_name = "FusedAddBiasEpilouge" + str(i) - MmaPipelined_param_list += epilouge_name + ", " + epilogue_name = "FusedAddBiasEpilogue" + str(i) + MmaPipelined_param_list += epilogue_name + ", " for i in range(self.b2b_num): MmaPolicy = "typename MmaCore" + str(i) + "::MmaPolicy" @@ -198,7 +198,7 @@ def gen_code(self): mmacore_codebody = self.gen_using_MmaCore(2) iterator_codebody = self.gen_using_Iterator() fragment_iterator_codebody = self.gen_fragment_iterator() - epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilouge() + epilogue_iterator_codebody = self.gen_using_FusedAddBiasEpilogue() threadBlockMma = self.gen_threadblockmma() specialized_code = mmacore_codebody + iterator_codebody + fragment_iterator_codebody + epilogue_iterator_codebody + threadBlockMma @@ -352,7 +352,7 @@ def gen_first_gemm_1stage(b2b_num): }\n\ \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ - // shared memory loads (which have the tighest latency requirement).\n\ + // shared memory loads (which have the tightest latency requirement).\n\ \n\ //\n\ // Mainloop\n\ @@ -459,7 +459,7 @@ def gen_first_gemm_2stage(b2b_num): }\n\ \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ - // shared memory loads (which have the tighest latency requirement).\n\ + // shared memory loads (which have the tightest latency requirement).\n\ iterator_A.load(tb_frag_A);\n\ \n\ //\n\ @@ -490,7 +490,7 @@ def gen_first_gemm_2stage(b2b_num): __syncthreads();\n\ \n\ // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing \n\ - // shared memory loads (which have the tighest latency requirement).\n\ + // shared memory loads (which have the tightest latency requirement).\n\ iterator_A.load(tb_frag_A);\n\ \n\ ++this->smem_iterator_B0_;\n\ @@ -549,12 +549,12 @@ 
def gemm_teamplate(id): code = "// " + str(id + 1) + " Gemm" code += " /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile\n" - code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilouge_accu", id - 1) + ";\n" + code += " " + helper.var_idx("FragmentC", id - 1) + helper.var_idx(" after_epilogue_accu", id - 1) + ";\n" code += " " + helper.var_idx("epilogue_", id - 1) + helper.var_idx("(output_op_", id - 1) + helper.var_idx(", accum", id - 1) \ - + helper.var_idx(", after_epilouge_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n" + + helper.var_idx(", after_epilogue_accu", id - 1) + helper.var_idx(", iterator_C", id - 1) +");\n" # FragmentIteratorA1 warp_tile_iterator_A1_(accum0); - code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilouge_accu", id - 1) + ");\n" + code += " " + helper.var_idx("FragmentIteratorA", id) + helper.var_idx(" warp_tile_iterator_A", id) +"_(" + helper.var_idx("after_epilogue_accu", id - 1) + ");\n" # FragmentB1 tb_frag_B1; code += " " + helper.var_idx("FragmentB", id) + " " + helper.var_idx("tb_frag_B", id) + ";\n" # tb_frag_B1.clear(); @@ -990,7 +990,7 @@ def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_ self.gen_b2b_mma_base = gen_b2b_mma_base(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) - self.gen_b2b_mma_piplined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) + self.gen_b2b_mma_pipelined = gen_b2b_mme_pipelined(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) self.gen_default_b2b_mma = gen_default_b2b_mma(template_param, gen_class_name, b2b_num, cutlass_deps_root, project_root) @@ -1001,7 +1001,7 @@ def gen_code(self, first_use_1stage): with open(self.file_dir + "b2b_mma_base.h", "w+") as f: f.write(base_code) - pipeline_code = self.gen_b2b_mma_piplined.gen_code(first_use_1stage = first_use_1stage) + pipeline_code = self.gen_b2b_mma_pipelined.gen_code(first_use_1stage = first_use_1stage) print("[INFO]: Gen kernel code [b2b_mma_pipelined.h]output Dir: is ", self.file_dir) with open(self.file_dir + "b2b_mma_pipelined.h", "w+") as f: diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py index 9eb6fa9c..ede9c1f8 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py @@ -45,7 +45,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir self.user_header_file = "" for header in user_header_file: self.user_header_file += "#include \"" + header + "\"\n" - self.seperate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) + self.separate_cutlass = gen_basic.gen_volta_turing_fuse_act_impl(fuse_gemm_info, gen_class_name, user_header_file, output_dir) self.gen_params() self.output_dir = output_dir @@ -53,14 +53,14 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, output_dir def gen_code(self): code = "" code += self.user_header_file - code += self.seperate_cutlass.gen_using(False) #False -> Turing, True -> Volta + code += self.separate_cutlass.gen_using(False) #False -> Turing, True -> Volta code_body = "" for i in range(self.b2b_num): code_body += " " + helper.var_idx("Gemm", i) + helper.var_idx(" gemm_op_", i) + ";\n" 
code_body += " " + helper.var_idx("gemm_op_", i) + helper.var_idx(".initialize(Arguments_", i) + ", nullptr);\n" - code_body += self.seperate_cutlass.gen_run() + code_body += self.separate_cutlass.gen_run() code += ir.gen_func(self.name, self.params, code_body) helper.write_2_headfile("cutlass_verify.h", self.output_dir, code) @@ -87,6 +87,6 @@ def get_params(self, declartion = True): def gen_initialize(): code = "" - initialize_code = self.seperate_cutlass.gen_initialize() + initialize_code = self.separate_cutlass.gen_initialize() code = ir.gen_func("initialize", [[]]) diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py index f221c2b6..e271a975 100644 --- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py +++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py @@ -83,23 +83,23 @@ def list_2_string(input_list, ): return rtn_string -def get_epilouge_info(layer_info): +def get_epilogue_info(layer_info): return layer_info['epilogue'] def get_epilogue_tp(layer_info): - epilogue_info = get_epilouge_info(layer_info) + epilogue_info = get_epilogue_info(layer_info) return epilogue_info['tp'] def get_epilogue_add_bias_or_not(layer_info): - epilogue_info = get_epilouge_info(layer_info) + epilogue_info = get_epilogue_info(layer_info) return epilogue_info['bias']['addbias'] def get_epilogue_add_bias_tp(layer_info): - epilogue_info = get_epilouge_info(layer_info) + epilogue_info = get_epilogue_info(layer_info) return epilogue_info['bias']['bias_tp'] def get_epilogue_args(layer_info): - epilogue_info = get_epilouge_info(layer_info) + epilogue_info = get_epilogue_info(layer_info) return epilogue_info['args'] def get_epilogue_bias_shape(layer_info): diff --git a/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu b/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu index 1d92bef9..ccf74a65 100644 --- a/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu +++ b/examples/49_hopper_gemm_schedules_with_collective_builder/49_hopper_gemm_schedules_with_collective_builder.cu @@ -33,7 +33,7 @@ \brief Hopper GEMM example leveraging collective operation builders. This example showcases the use of CUTLASS's CollectiveBuilder to easily construct performant kernels - targetting the NVIDIA Hopper architecture. + targeting the NVIDIA Hopper architecture. Background and motivation ------------------------- @@ -45,7 +45,7 @@ However, DefaultGemmConfigurations leave multiple opportunities for improvement, which are addressed in CUTLASS 3: (1) DefaultGemmConfigurations do not allow one to use a more-performant set of parameters without - specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targetting + specifying every parameter. For example, the DefaultGemmConfigurations for GEMMs targeting Ampere specify that three pipeline stages should be used regardless of the sizes of operands. If one wished to increase this value, one would also need to specify all other template parameters. This leaves a gap between a high-level ease-of-use interface and a lower-level detailed interface. @@ -55,7 +55,7 @@ Alongside these opportunities for improvement, the Hopper architecture offers new features that increase the number of valid configurations of a kernel. 
In addition to the many template parameters already available - in CUTLASS 2 kernels, CUTLASS 3 kernels targetting Hopper also have various scheduling modes to select from that control: + in CUTLASS 2 kernels, CUTLASS 3 kernels targeting Hopper also have various scheduling modes to select from that control: (1) how data is to be loaded (e.g., using the Hopper TMA feature or Ampere cp.async) (2) how work is to be divided among warps in a thread block (e.g., whether to use "warp specialization") (3) whether persistent thread blocks should be used @@ -64,13 +64,13 @@ Introduction to the CollectiveBuilder ------------------------------------- CUTLASS 3 introduces the CollectiveBuilder to further ease the process of selecting template parameters - for kernels targetting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder + for kernels targeting Hopper. Similar to the DefaultGemmConfigurations used in CUTLASS 2, the CollectiveBuilder takes in a small set of template parameters (e.g., the data types of operands A and B). It then automatically determines the data loading strategy to use depending on whether the Hopper TMA feature can be used with the provided parameters. If one does not indicate a particular scheduling policy or stage count to use (by using `Auto` template parameters), the CollectiveBuilder will also automatically select these. - Unlike DefaultGemmConfigurations a parital specialization of the CollectiveBuilder is not needed for many + Unlike DefaultGemmConfigurations a partial specialization of the CollectiveBuilder is not needed for many configurations of operand types. Instead the CollectiveBuilder "builds" a configuration based on generic properties of the specified operands, layouts, and other parameters. For example, when the stage count is set to `Auto`, the CollectiveBuilder may automatically calculate the maximum number of stages that @@ -90,7 +90,7 @@ Details of this example ----------------------- This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified. - This example also illustrates how CUTLASS 3 GEMMs targetting Hopper automatically support batched GEMMs by simply + This example also illustrates how CUTLASS 3 GEMMs targeting Hopper automatically support batched GEMMs by simply extending the problem size with an additional tensor rank. 
Example usage: @@ -162,7 +162,7 @@ struct Options { out << "49_hopper_gemm_schedules_with_collective_builder\n\n" << " This example showcases the use of CUTLASS's collective operation builders to easily construct\n" - << " performant kernels targetting NVIDIA's Hopper architecture.\n\n" + << " performant kernels targeting NVIDIA's Hopper architecture.\n\n" << "Options:\n\n" << " --help If specified, displays this usage statement\n\n" << " --m= Sets the M extent of the GEMM\n" diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp index 18e22bf6..f414ebc4 100644 --- a/include/cute/atom/copy_traits_sm90_tma.hpp +++ b/include/cute/atom/copy_traits_sm90_tma.hpp @@ -718,7 +718,7 @@ make_tma_copy(CopyOp, << "\nswizzle " << smem_swizzle << "\nl2Promotion " << tma_l2Promotion << "\noobFill " << tma_oobFill << std::endl; - std::cerr << "Error: Failed to intialize the TMA descriptor " << result << std::endl; + std::cerr << "Error: Failed to initialize the TMA descriptor " << result << std::endl; assert(false); } #endif // (__CUDACC_VER_MAJOR__ >= 12) diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 7d4d693a..537f215a 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -98,11 +98,11 @@ struct OpClassSimt {}; ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tag classifing operators as Tensor Core operations. +/// Tag classifying operators as Tensor Core operations. struct OpClassTensorOp {}; ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tag classifing operators as WMMA Tensor Core operations +/// Tag classifying operators as WMMA Tensor Core operations struct OpClassWmmaTensorOp {}; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h index 38d94acc..03e77c04 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -230,7 +230,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h; offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w; - // Intialize pointers for gemm_k=0 + // Initialize pointers for gemm_k=0 TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_}; pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; @@ -341,7 +341,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < next_idx = 1; - // Restore bytes in q coordinate (Mma in filter s dimenstion) + // Restore bytes in q coordinate (Mma in filter s dimension) reset_bytes = reset_bytes_s_; } else { @@ -351,7 +351,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized < next_idx = 2; - // Restore bytes in p and q coordinate (Mma in filter s and r dimenstion) + // Restore bytes in p and q coordinate (Mma in filter s and r dimension) reset_bytes = reset_bytes_r_; } #else diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h index 
6e73115c..c72356be 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -195,7 +195,7 @@ class Conv2dWgradActivationTileAccessIteratorAnalytic { s = filter_s_[iteration_contiguous_]; } else { - /// Multiple access to support non-128b alignment in contiguous dimenstion + /// Multiple access to support non-128b alignment in contiguous dimension c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C; int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C; s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S; diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h index 88717359..16cd2564 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -212,7 +212,7 @@ class Conv2dWgradActivationTileAccessIteratorOptimized { if (kAccessesPerVector > 1) { // This code section is only to support non-128b alignment - // Multiple access to support non-128b alignment in contiguous dimenstion + // Multiple access to support non-128b alignment in contiguous dimension int wrap_c; params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements); diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h index fd43e404..1f82769b 100644 --- a/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h +++ b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h @@ -241,7 +241,7 @@ class DepthwiseFpropPipelined : public gemm::threadblock::MmaBase filter, i.e., stride={2x2} and filter={1x1}) // // * Optimization * - // Only launch CTAs in M dimenstion which contribute to a row in Dx output + // Only launch CTAs in M dimension which contribute to a row in Dx output // // // * Constraints * @@ -107,7 +107,7 @@ struct StridedDgradHorizontalThreadblockSwizzle : // compute number of tiles in m dimension int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); - // compute number of tiles in n dimenstion + // compute number of tiles in n dimension int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); return gemm::GemmCoord( @@ -148,7 +148,7 @@ struct StridedDgradIdentityThreadblockSwizzle : // compute number of tiles in m dimension int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); - // compute number of tiles in n dimenstion + // compute number of tiles in n dimension int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); return gemm::GemmCoord( diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h index 1b258164..2ec93389 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h @@ -77,7 +77,7 @@ namespace threadblock { // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) ///////////////////////////////////////////////////////////////////////////////////////////////// template < - /// 
Epilouge Shape + /// Epilogue Shape typename Shape_, /// Warp-level mma operator typename WarpMmaTensorOp_, diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h index 966d44cc..0b2da7d2 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h @@ -78,7 +78,7 @@ namespace threadblock { // D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) ///////////////////////////////////////////////////////////////////////////////////////////////// template < - /// Epilouge Shape + /// Epilogue Shape typename Shape_, /// Warp-level mma operator typename WarpMmaTensorOp_, diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h index a641f608..ada7c467 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h @@ -198,7 +198,7 @@ class PredicatedTileIteratorDirectConv { /// A thread's starting column Index thread_start_column_; - /// Initial thread ouput location + /// Initial thread output location int thread_start_n_, thread_start_p_, thread_start_q_; /// Current threadblock tile index diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h index 1e8c71ec..654d09c3 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h @@ -186,10 +186,10 @@ class PredicatedTileIteratorStridedDgrad { /// Extent of the matrix tile in rows Index extent_row_; - /// Starting Dx h and w dimenstion for strided dgrad mapping + /// Starting Dx h and w dimension for strided dgrad mapping int start_h_, start_w_; - /// Effective Dy P and Q dimenstions for strided dgrad mapping + /// Effective Dy P and Q dimensions for strided dgrad mapping int p_, q_; /// A thread's starting row position (assuming steady-state predicates have been computed) diff --git a/include/cutlass/gemm/device/ell_gemm.h b/include/cutlass/gemm/device/ell_gemm.h index d8698a7c..630ab7be 100644 --- a/include/cutlass/gemm/device/ell_gemm.h +++ b/include/cutlass/gemm/device/ell_gemm.h @@ -547,7 +547,7 @@ class EllGemm { //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index 68fa29bf..482c4b4a 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -521,7 +521,7 @@ class Gemm { //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. 
template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_array.h b/include/cutlass/gemm/device/gemm_array.h index dd244f88..a937da48 100644 --- a/include/cutlass/gemm/device/gemm_array.h +++ b/include/cutlass/gemm/device/gemm_array.h @@ -476,7 +476,7 @@ class GemmArray { //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_batched.h b/include/cutlass/gemm/device/gemm_batched.h index 6f510e90..a27f0b8a 100644 --- a/include/cutlass/gemm/device/gemm_batched.h +++ b/include/cutlass/gemm/device/gemm_batched.h @@ -454,7 +454,7 @@ class GemmBatched { //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_complex.h b/include/cutlass/gemm/device/gemm_complex.h index 5bd856fe..5e44d624 100644 --- a/include/cutlass/gemm/device/gemm_complex.h +++ b/include/cutlass/gemm/device/gemm_complex.h @@ -475,7 +475,7 @@ class GemmComplex { //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h b/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h index 3ebb2a74..d7228609 100644 --- a/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h +++ b/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h @@ -194,7 +194,7 @@ class GemmLayernormMainloopFusion : //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h index 6c19b8a1..87a8c955 100644 --- a/include/cutlass/gemm/device/gemm_universal.h +++ b/include/cutlass/gemm/device/gemm_universal.h @@ -219,7 +219,7 @@ class GemmUniversal : //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. 
template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h index 34b3f6c7..54b7d61c 100644 --- a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h +++ b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h @@ -198,7 +198,7 @@ class GemmUniversalWithBroadcast : //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/gemm_with_k_reduction.h b/include/cutlass/gemm/device/gemm_with_k_reduction.h index c671d7c6..c637a579 100644 --- a/include/cutlass/gemm/device/gemm_with_k_reduction.h +++ b/include/cutlass/gemm/device/gemm_with_k_reduction.h @@ -211,7 +211,7 @@ class GemmWithKReduction : //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/rank_2k.h b/include/cutlass/gemm/device/rank_2k.h index d333ffa2..a81298d4 100644 --- a/include/cutlass/gemm/device/rank_2k.h +++ b/include/cutlass/gemm/device/rank_2k.h @@ -348,7 +348,7 @@ class Rank2K { }; //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchange operand. +/// Partial specialization for column-major output exchange operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/rank_k.h b/include/cutlass/gemm/device/rank_k.h index a2101a79..394e7668 100644 --- a/include/cutlass/gemm/device/rank_k.h +++ b/include/cutlass/gemm/device/rank_k.h @@ -325,7 +325,7 @@ class RankK { }; //////////////////////////////////////////////////////////////////////////////// -/// Parital specialization for column-major output exchange operand. +/// Partial specialization for column-major output exchange operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/symm.h b/include/cutlass/gemm/device/symm.h index 57bfeec5..23563b56 100755 --- a/include/cutlass/gemm/device/symm.h +++ b/include/cutlass/gemm/device/symm.h @@ -408,7 +408,7 @@ class Symm { call GEMM mainloop for with RowMajor efficient-epilogue ********************************************************************************************************/ -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. 
template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/device/trmm.h b/include/cutlass/gemm/device/trmm.h index 34816db4..cd4ea3d4 100644 --- a/include/cutlass/gemm/device/trmm.h +++ b/include/cutlass/gemm/device/trmm.h @@ -563,7 +563,7 @@ For the mainloop and trmm kernel, `A` and `B` points to left-side and right-side call GEMM mainloop for with RowMajor efficient-epilogue ********************************************************************************************************/ -/// Parital specialization for column-major output exchanges problem size and operand. +/// Partial specialization for column-major output exchanges problem size and operand. template < /// Element type for A matrix operand typename ElementA_, diff --git a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h index e3ef316b..dfe62d35 100644 --- a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h +++ b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h @@ -137,7 +137,7 @@ struct DefaultGemmWithBroadcast { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Parital specialization: ArchTag = cutlass::arch::Sm70 +/// Partial specialization: ArchTag = cutlass::arch::Sm70 /// /// template < diff --git a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h index 6e9e647c..789b4bde 100644 --- a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h +++ b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h @@ -138,7 +138,7 @@ struct DefaultGemmWithReduction { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Parital specialization: ArchTag = cutlass::arch::Sm70 +/// Partial specialization: ArchTag = cutlass::arch::Sm70 /// /// template < diff --git a/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h b/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h index 92cc2a73..aee9c71c 100644 --- a/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h +++ b/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h @@ -138,7 +138,7 @@ i = i_macro j = j_macro - Handling cases with grid dimensions that aren't multiples of eachother + Handling cases with grid dimensions that aren't multiples of each other ---------------------------------------------------------------------- Even though threadblock shapes M and N are typically multiples of one another, the grid for a given problem may not have dimensions of the same ratio as that of the threadblock. diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp index bd82ed11..305654d8 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp @@ -196,7 +196,7 @@ class GemmUniversal< // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { - printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. 
Aborting.\n"); return; } #endif diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp index 9fc719e2..f3a4a55c 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp @@ -186,7 +186,7 @@ class GemmUniversal< // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { - printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); return; } #endif diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp index 498bfad4..6fa93945 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_persistent.hpp @@ -258,7 +258,7 @@ class GemmUniversal< // Any Tensor Op MMA Atom in the WGMMA ISA is arch conditional to sm90a. #if ! defined(__CUDA_ARCH_FEAT_SM90_ALL) if constexpr(size<0>(typename TiledMma::AtomShape_MNK{}) == 64) { - printf("ERROR : Arch conditional MMA instruction used without targetting sm90a compute capability. Aborting.\n"); + printf("ERROR : Arch conditional MMA instruction used without targeting sm90a compute capability. Aborting.\n"); return; } #endif diff --git a/include/cutlass/gemm/threadblock/ell_mma_pipelined.h b/include/cutlass/gemm/threadblock/ell_mma_pipelined.h index 10ff6dfa..8b1c2c43 100644 --- a/include/cutlass/gemm/threadblock/ell_mma_pipelined.h +++ b/include/cutlass/gemm/threadblock/ell_mma_pipelined.h @@ -271,7 +271,7 @@ class EllMmaPipelined : public MmaBase { } // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). // // Mainloop diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h index 160c5480..d6beec45 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h @@ -321,7 +321,7 @@ class MmaPlanarComplexPipelined : iterator_B_imag.clear_mask(gemm_k_iterations <= 1); // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing - // shared memory loads (which have the tighest latency requirement). + // shared memory loads (which have the tightest latency requirement). 
// // Mainloop diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h index d139ed46..5a0b9f47 100644 --- a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h +++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h @@ -83,7 +83,7 @@ struct TensorReductionAffineContiguousParams { uint64_t outer_count; /// Number of elements in outer index space ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank - ElementSource const * source; /// Poitner to source pointer of rank kRank + ElementSource const * source; /// Pointer to source pointer of rank kRank ReductionOp reduction_op; /// Reduction operator ElementCompute reduction_identity; /// Identity element used by reduction operator ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h index 9d5b0458..574c836d 100644 --- a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h +++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h @@ -85,7 +85,7 @@ struct TensorReductionAffineStridedParams { uint64_t outer_count; /// Number of elements in outer index space ElementOutput * destination; /// Pointer to output tensor of rank kReducedRank - ElementSource const * source; /// Poitner to source pointer of rank kRank + ElementSource const * source; /// Pointer to source pointer of rank kRank ReductionOp reduction_op; /// Reduction operator ElementCompute reduction_identity; /// Identity element for reduction operator ElementCompute *device_workspace; /// Pointer to device workspace for inter-CTA reductions diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 29fa8af8..1026bad2 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -399,7 +399,7 @@ class PredicatedTileAccessIterator diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index 79f00d15..201d4fe7 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -517,7 +517,7 @@ class InterleavedTestbedConv2d { ///////////////////////////////////////////////////////////////////////////////////////////////////////// // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes -// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // (conv_blacklist_sizes) ///////////////////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h index d678e3b5..1b771607 100644 --- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h +++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h @@ -502,7 +502,7 @@ class TestbedConv2dWithBroadcast { 
///////////////////////////////////////////////////////////////////////////////////////////////////////// // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes -// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // (conv_blacklist_sizes) ///////////////////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index a5fa1861..60b12d6f 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -522,7 +522,7 @@ class TestbedConv3d { ///////////////////////////////////////////////////////////////////////////////////////////////////////// // TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference // TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes -// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes // (conv_blacklist_sizes) ///////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/default_gemm_configuration.hpp b/test/unit/gemm/device/default_gemm_configuration.hpp index f84e9297..76422b15 100644 --- a/test/unit/gemm/device/default_gemm_configuration.hpp +++ b/test/unit/gemm/device/default_gemm_configuration.hpp @@ -638,7 +638,7 @@ struct DefaultGemmConfigurationToCutlass3Types< GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B >; - // Epilouge + // Epilogue using CollectiveEpilogue = epilogue::collective::DefaultEpilogue< TagToStrideC_t, TagToStrideC_t, diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 8125989d..8a0dfcba 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -321,13 +321,13 @@ class Handle { NumericTypeID element_C, /// Data type of C and D matrix void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices - void const * const * ptr_C_imag, /// Pointer to array containing poitners to imaginary part of C matrices + void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices int64_t ldc_real, /// Leading dimension of real part of C matrix int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices - void * const * ptr_D_imag, /// Pointer to array containing poitners to imaginary part of D matrices + void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices int64_t ldd_real, /// Leading dimension of real part of D matrix int64_t ldd_imag /// Leading dimension of imaginary part of D matrix diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index 6bb3f799..dbd70c44 100644 --- 
a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -518,7 +518,7 @@ struct GemmDescription : public OperationDescription { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Desciprion for structured sparse GEMMs. +/// Description for structured sparse GEMMs. struct SparseGemmDescription : public GemmDescription { /// Description structure for structured sparse GEMM @@ -1160,7 +1160,7 @@ struct GemmGroupedArguments { // OperationKind: kSparseGemm // -/// Computes GEMM assumine one of the inputs has 2:4 structured sparsity. +/// Computes GEMM assuming one of the inputs has 2:4 structured sparsity. struct SparseGemmConfiguration { GemmUniversalMode mode; @@ -1187,7 +1187,7 @@ struct SparseGemmArguments { void const *B; /// pointer to B matrix void const *C; /// pointer to C matrix void *D; /// pointer to D matrix - void const *E; /// pointer to E matric (metadata) + void const *E; /// pointer to E matrix (metadata) void const *alpha; /// pointer to alpha scalar void const *beta; /// pointer to beta scalar @@ -1465,7 +1465,7 @@ struct ConvArguments { /// pointer to implicit gemm matrix C void const *C; - /// pointer to implicit gemm desitination matrix D + /// pointer to implicit gemm destination matrix D void *D; /// Host or device pointer to alpha scalar @@ -1487,16 +1487,16 @@ struct ConvArguments { // struct ReductionConfiguration { - /// Redcution problem size + /// Reduction problem size MatrixCoord problem_size; /// Number of partitions to reduce int partitions; - /// Number of lements between each partition + /// Number of elements between each partition int64_t partition_stride; - /// leading dimension of 'w'orksace operand + /// leading dimension of 'w'orkspace operand int64_t ldw; /// leading dimension of 's'ource operand diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index 401370d1..8f64de9c 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -89,7 +89,7 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ return operations -# Generates 3.0 API based GemmUniversal API kernels. Alignment constraits are folded in with layouts +# Generates 3.0 API based GemmUniversal API kernels. 
Alignment constraints are folded in with layouts def CreateGemmUniversal3xOperator( manifest, layouts, tile_descriptions, data_type, complex_transforms=None, @@ -4831,7 +4831,7 @@ def GenerateSM90(manifest, cuda_version): parser.add_argument("--architectures", default='53;60;61;70;75;80', help="Target compute architectures") parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.') parser.add_argument("--ignore-kernels", default='', help='Comma delimited list of kernels to exclude from build.') - parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose comupte capability range is not satisfied by the build target are excluded.') + parser.add_argument("--filter-by-cc", default='True', type=str, help='If enabled, kernels whose compute capability range is not satisfied by the build target are excluded.') parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit") parser.add_argument('--kernel-filter-file', type=str, default=None, required=False, help='Full path of filter file') parser.add_argument('--selected-kernel-list', type=str, default=None, required=False, diff --git a/tools/library/scripts/pycutlass/README.md b/tools/library/scripts/pycutlass/README.md index dd2e7d0e..8d0dbaef 100644 --- a/tools/library/scripts/pycutlass/README.md +++ b/tools/library/scripts/pycutlass/README.md @@ -83,7 +83,7 @@ The tiling size of above operations can also be customized. ### Using Docker We recommend using one of our provided Docker images for using PyCUTLASS. -**To run CUTLASS 3 GEMM kernels targetting the NVIDIA Hopper architecture via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda12.0) based on the NGC CUDA 12.0 container: +**To run CUTLASS 3 GEMM kernels targeting the NVIDIA Hopper architecture via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda12.0) based on the NGC CUDA 12.0 container: ```shell docker build -t pycutlass-cuda12.0:latest -f docker/Dockerfile-cuda12.0 . docker run --gpus all -it --rm pycutlass-cuda12.0:latest @@ -91,7 +91,7 @@ docker run --gpus all -it --rm pycutlass-cuda12.0:latest Note that this Docker container does not include CuPy or PyTorch, and, thus, will not be able to run PyCUTLASS examples that leverage these packages. -**To run CUTLASS 2.x kernels targetting pre-SM90 architectures via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda11.8-pytorch) based on an NGC PyTorch container: +**To run CUTLASS 2.x kernels targeting pre-SM90 architectures via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda11.8-pytorch) based on an NGC PyTorch container: ```shell docker build -t pycutlass-cuda11.8-pytorch:latest -f docker/Dockerfile-cuda11.8-pytorch . docker run --gpus all -it --rm pycutlass-cuda11.8-pytorch:latest diff --git a/tools/library/scripts/pycutlass/docs/source/md/basic_idea.md b/tools/library/scripts/pycutlass/docs/source/md/basic_idea.md index 655caa39..a417afd9 100644 --- a/tools/library/scripts/pycutlass/docs/source/md/basic_idea.md +++ b/tools/library/scripts/pycutlass/docs/source/md/basic_idea.md @@ -10,7 +10,7 @@ PyCUTLASS handles the following things when launch the CUTLASS kernels ## Memory management -PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. At the begining of the program, call +PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. 
At the beginning of the program, call ```python pycutlass.get_memory_pool({init_pool_size_in_bytes}, {max_pool_size_in_bytes}) ``` @@ -41,7 +41,7 @@ The `{instruction_shape}` and `{opclass}` defines the instruction size and type. |Opclass | element_a/element_b | element_acc | instruction_shape | math_operation | | -- | -- | -- | -- | -- | | cutlass.OpClass.TensorOp | cutlass.float64 | cutlass.float64 | [8, 8, 4] | MathOperation.multiply_add| -| | cutass.float32 cutlass.tfloat32, cutlass.float16 cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add MathOperation.multiply_add_fast_f32 MathOperation.multiply_add_fast_f16 MathOperation.multiply_add_fast_bf16 | +| | cutlass.float32 cutlass.tfloat32, cutlass.float16 cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add MathOperation.multiply_add_fast_f32 MathOperation.multiply_add_fast_f16 MathOperation.multiply_add_fast_bf16 | | | cutlass.float16 | cutlass.float16/cutlass.float32|[16, 8, 16]| MathOperation.multiply_add | | | cutlass.bfloat_16 | cutlass.float32 | [16, 8, 16]|MathOperation.multiply_add | | | cutlass.int8 | cutlass.int32 | [16, 8, 32] | MathOperation.multiply_add_saturate| @@ -178,7 +178,7 @@ import pycutlass pycutlass.compiler.add_module([operation,]) ``` -Several operations can be compiled togather. The `nvcc` at `$CUDA_INSTALL_PATH/bin` is used by default as the compiler backend. But you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with +Several operations can be compiled together. The `nvcc` at `$CUDA_INSTALL_PATH/bin` is used by default as the compiler backend. But you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with ```python pycutlass.compiler.nvrtc() ``` @@ -249,7 +249,7 @@ arguments = Conv2dArguments( * `split_k_mode`: currently we support `cutlass.conv.SplitKMode.Serial` and `cutlass.conv.SplitKMode.Parallel`. * `split_k_slice`: number of split-k slices -For ordianry conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slice=1`. +For ordinary conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slice=1`. 
### Getting output_op The way to create output_op is listed below diff --git a/tools/library/scripts/pycutlass/src/cpp/include/arch.h b/tools/library/scripts/pycutlass/src/cpp/include/arch.h index 21f97713..93a313d5 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/arch.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/arch.h @@ -51,9 +51,9 @@ void bind_opcode(py::module &m) { .value("Simt", cutlass::OpcodeClass::kSimt, R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc") .value("TensorOp", cutlass::OpcodeClass::kTensorOp, - R"pbdoc(Tag classifing operators as Tensor Core operations)pbdoc") + R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc") .value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp, - R"pbdoc(Tag classifing operators as WMMA Tensor Core operations)pbdoc") + R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc") .value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp, - R"pbdoc(Tag classifing operators as sparseTensor Core operations)pbdoc"); + R"pbdoc(Tag classifying operators as sparseTensor Core operations)pbdoc"); } diff --git a/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h b/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h index ab4a067f..cfc6e64e 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/conv/conv_problem_size.h @@ -42,7 +42,7 @@ namespace py = pybind11; void bind_conv_problem_size(py::module &m) { // // Conv2d Problem Size: - // include/cutlass/conv/conv2d_problem_sizd.h + // include/cutlass/conv/conv2d_problem_size.h // py::class_(m, "Conv2dProblemSize") // constructors diff --git a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_binary.h b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_binary.h index d9fa4458..7486e56f 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_binary.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_binary.h @@ -87,7 +87,7 @@ class VisitorOpBinary{ using BinaryOp = BinaryOp_; static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A"); - static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess misnatches with Visitor B"); + static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B"); /// SMEM buffer class required in the epilogue visitor struct SharedStorage { diff --git a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_reduction.h b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_reduction.h index 624d7e68..289119c9 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_reduction.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_column_reduction.h @@ -55,7 +55,7 @@ template < typename ElementReduction_, ///< Data type of the output reduction in device memory typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register typename OutputTileIterator_, ///< Tile Iterator type - typename Visitor_ ///< preceeding visitor op + typename Visitor_ ///< preceding visitor op > class 
VisitorOpColumnReduction { public: @@ -83,7 +83,7 @@ class VisitorOpColumnReduction { /// Fragment type of accumulator using AccumulatorAccessType = Array; - /// Fragment type of redcution + /// Fragment type of reduction using ReductionAccumulatorAccessType = Array; /// Thread map used by output tile iterators diff --git a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_linear_combination.h b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_linear_combination.h index 1e2b8e61..259656e7 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_linear_combination.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_linear_combination.h @@ -86,7 +86,7 @@ class VisitorOpLinearCombination{ using CombinationOp = cutlass::plus; static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A"); - static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess misnatches with Visitor B"); + static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B"); /// SMEM buffer class required in the epilogue visitor struct SharedStorage { diff --git a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_reduction.h b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_reduction.h index 27b03f84..72f65c52 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_reduction.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_row_reduction.h @@ -56,7 +56,7 @@ template < typename ElementReduction_, ///< Data type of the output reduction in device memory typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register typename OutputTileIterator_, ///< Tile Iterator type - typename Visitor_ ///< preceeding visitor op + typename Visitor_ ///< preceding visitor op > class VisitorOpRowReduction { public: @@ -82,7 +82,7 @@ class VisitorOpRowReduction { /// Fragment type of accumulator using AccumulatorAccessType = Array; - /// Fragment type of redcution + /// Fragment type of reduction using ReductionAccumulatorAccessType = Array; /// Thread map used by output tile iterators diff --git a/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h b/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h index 36987b5d..6eb6abfd 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/gemm/gemm.h @@ -52,7 +52,7 @@ void bind_gemm(py::module &m) { .value("Array", cutlass::gemm::GemmUniversalMode::kArray) .value("Invalid", cutlass::gemm::GemmUniversalMode::kInvalid); - /// GemmCoord is a structure that specifies a location within the coordiate space of a GEMM problem + /// GemmCoord is a structure that specifies a location within the coordinate space of a GEMM problem py::class_(m, "GemmCoord") .def(py::init()) .def("m", py::overload_cast<>(&cutlass::gemm::GemmCoord::m)) diff --git a/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h b/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h index 43991e46..970cd6d3 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/swizzling.h @@ -88,7 
+88,7 @@ void bind_identity_swizzle(py::module & m, std::string name) { R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc") .def("tag", [](const T & swizzle){ return demangle(typeid(T).name()); - }, R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc"); + }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc"); } template @@ -109,7 +109,7 @@ void bind_swizzle(py::module & m, std::string name, std::string doc) { R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc") .def("tag", [](const T & swizzle){ return demangle(typeid(T).name()); - }, R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc"); + }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc"); } template @@ -132,7 +132,7 @@ void bind_dgrad_swizzle(py::module & m, std::string name) { R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc") .def("tag", [](const T & swizzle){ return demangle(typeid(T).name()); - }, R"pbdoc(Returns the c++ name of the swizzling for code emittion)pbdoc"); + }, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc"); } void bind_threadblock_swizzle(py::module &m) { diff --git a/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h b/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h index 09a4adde..f99d6339 100644 --- a/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h +++ b/tools/library/scripts/pycutlass/src/cpp/include/tensor_ref_view.h @@ -20,7 +20,7 @@ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSE cuda.CUresult: diff --git a/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py b/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py index 63ae6da9..43f2cee5 100644 --- a/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +++ b/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py @@ -498,7 +498,7 @@ def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, ######################################################################################################## # TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference # TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes -# Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +# Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes # (conv_blacklist_sizes) ############################################################################################################ diff --git a/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py b/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py index 4fb46c1f..ab3ae5ad 100644 --- a/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +++ b/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py @@ -489,7 +489,7 @@ def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"): else: alignment = 128 // minimum_operand_element_size - # int8_t gemm alignment constrainst + # int8_t gemm alignment 
constraints if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 and operation.A.layout == cutlass.ColumnMajor: alignment_m = 4 else: diff --git a/tools/library/scripts/pycutlass/src/pycutlass/test/utils.py b/tools/library/scripts/pycutlass/src/pycutlass/test/utils.py index 55281bec..f1a25f92 100644 --- a/tools/library/scripts/pycutlass/src/pycutlass/test/utils.py +++ b/tools/library/scripts/pycutlass/src/pycutlass/test/utils.py @@ -63,7 +63,7 @@ def get_name(layouts, alignments, element_output, Generates a procedural name for a test case. :param layouts: indexable container of layouts of A, B, and C operands - :param alignments: indexable container of alingments of A, B, and C operands + :param alignments: indexable container of alignments of A, B, and C operands :param element_output: data type of the output element :param element_accumulator: data type used in accumulation :param element_epilogue: data type used in computing the epilogue diff --git a/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm90.py b/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm90.py index 9237326a..8d91979e 100644 --- a/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm90.py +++ b/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm90.py @@ -52,7 +52,7 @@ def add_test(cls, layouts, alignments, element_output, element_accumulator, elem :param cls: class to which the generated method will be added :type cls: type :param layouts: indexable container of layouts of A, B, and C operands - :param alignments: indexable container of alingments of A, B, and C operands + :param alignments: indexable container of alignments of A, B, and C operands :param element_output: data type of the output element :param element_accumulator: data type used in accumulation :param element_epilogue: data type used in computing the epilogue diff --git a/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm90.py b/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm90.py index 81540b35..79339cae 100644 --- a/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm90.py +++ b/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm90.py @@ -42,7 +42,7 @@ from pycutlass.utils.device import device_cc -# Partial specialziation for naming tests +# Partial specialization for naming tests name_fn = partial(get_name, element_a=cutlass.float16, element_b=cutlass.float16, arch=90) @@ -54,7 +54,7 @@ def add_test(cls, layouts, alignments, element_output, element_accumulator, elem :param cls: class to which the generated method will be added :type cls: type :param layouts: indexable container of layouts of A, B, and C operands - :param alignments: indexable container of alingments of A, B, and C operands + :param alignments: indexable container of alignments of A, B, and C operands :param element_output: data type of the output element :param element_accumulator: data type used in accumulation :param element_epilogue: data type used in computing the epilogue diff --git a/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm90.py b/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm90.py index 4140ed4a..d4d6fdc1 100644 --- a/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm90.py +++ b/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm90.py @@ -52,7 +52,7 @@ def add_test(cls, layouts, alignments, element_output, element_accumulator, elem :param cls: class to which the generated method will be added :type cls: type :param layouts: indexable container of layouts of A, B, and C operands - :param alignments: 
indexable container of alingments of A, B, and C operands + :param alignments: indexable container of alignments of A, B, and C operands :param element_output: data type of the output element :param element_accumulator: data type used in accumulation :param element_epilogue: data type used in computing the epilogue diff --git a/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm90.py b/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm90.py index e06d538f..682ab7d5 100644 --- a/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm90.py +++ b/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm90.py @@ -52,7 +52,7 @@ def add_test(cls, layouts, alignments, element_output, element_accumulator, elem :param cls: class to which the generated method will be added :type cls: type :param layouts: indexable container of layouts of A, B, and C operands - :param alignments: indexable container of alingments of A, B, and C operands + :param alignments: indexable container of alignments of A, B, and C operands :param element_output: data type of the output element :param element_accumulator: data type used in accumulation :param element_epilogue: data type used in computing the epilogue diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h index 5d06e721..d1b1e841 100644 --- a/tools/library/src/conv2d_operation.h +++ b/tools/library/src/conv2d_operation.h @@ -358,7 +358,7 @@ class Conv2dOperation : public Conv2dOperationBase { << operator_args.problem_size << std::endl << " split_k_mode: " << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial" : "parallel") << std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.output_op.alpha << ", " << operator_args.output_op.beta << std::endl << " ref_A (ptr, {stride}): " @@ -610,7 +610,7 @@ class DirectConv2dOperation : public Conv2dOperation { << operator_args.problem_size << std::endl << " split_k_mode: " << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial" : "parallel") << std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.output_op.alpha << ", " << operator_args.output_op.beta << std::endl << " ref_A (ptr, {stride}): " diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h index 0e2a1c6b..a7a0bace 100644 --- a/tools/library/src/conv3d_operation.h +++ b/tools/library/src/conv3d_operation.h @@ -349,7 +349,7 @@ class Conv3dOperation : public Conv3dOperationBase { << operator_args.problem_size << std::endl << " split_k_mode: " << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? 
"serial" : "parallel") << std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.output_op.alpha << ", " << operator_args.output_op.beta << std::endl << " ref_A (ptr, {stride}): " diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index fdfe2516..90f61126 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -908,13 +908,13 @@ Status Handle::gemm_planar_complex_array( NumericTypeID element_C, /// Data type of C and D matrix void const * const * ptr_C_real, /// Pointer to array containing pointers to real part of C matrices - void const * const * ptr_C_imag, /// Pointer to array containing poitners to imaginary part of C matrices + void const * const * ptr_C_imag, /// Pointer to array containing pointers to imaginary part of C matrices int64_t ldc_real, /// Leading dimension of real part of C matrix int64_t ldc_imag, /// Leading dimension of imaginary part of C matrix void * const * ptr_D_real, /// Pointer to array containing pointers to real part of D matrices - void * const * ptr_D_imag, /// Pointer to array containing poitners to imaginary part of D matrices + void * const * ptr_D_imag, /// Pointer to array containing pointers to imaginary part of D matrices int64_t ldd_real, /// Leading dimension of real part of D matrix int64_t ldd_imag /// Leading dimension of imaginary part of D matrix diff --git a/tools/library/src/rank_2k_operation.h b/tools/library/src/rank_2k_operation.h index d6e0dcae..901f2ea6 100644 --- a/tools/library/src/rank_2k_operation.h +++ b/tools/library/src/rank_2k_operation.h @@ -347,7 +347,7 @@ class Rank2KOperation : public Rank2KOperationBase { std::cout << "Rank2KOperation::OperatorArguments" << std::endl << " problem_size:" << std::endl << operator_args.problem_size << std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.epilogue.alpha << ", " << operator_args.epilogue.beta << std::endl << " ref_A (ptr, {stride}): " diff --git a/tools/library/src/reduction/reduction_operation.h b/tools/library/src/reduction/reduction_operation.h index 846ca02e..4a6909cc 100644 --- a/tools/library/src/reduction/reduction_operation.h +++ b/tools/library/src/reduction/reduction_operation.h @@ -266,7 +266,7 @@ class ReductionOperation : public Operation { << operator_args.partitions << std::endl << " partition_stride: " << operator_args.partition_stride << std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.output.alpha << ", " << operator_args.output.beta << std::endl << " workspace (ptr, stride): " diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h index 3a294a23..0b108c2b 100644 --- a/tools/library/src/reference/conv_reference_operation.h +++ b/tools/library/src/reference/conv_reference_operation.h @@ -74,7 +74,7 @@ template < > struct ConvReferenceDispatcher; -/// Dispatcher for Conv2d (partially specialied for kConvDim == 2) +/// Dispatcher for Conv2d (partially specialized for kConvDim == 2) template < Provider kProvider, conv::Operator kConvolutionalOperator, diff --git a/tools/library/src/symm_operation.h b/tools/library/src/symm_operation.h index d7554edd..59bb5214 100644 --- a/tools/library/src/symm_operation.h +++ b/tools/library/src/symm_operation.h @@ -353,7 +353,7 @@ class SymmOperation : public SymmOperationBase { std::cout << "SymmOperation::OperatorArguments" << std::endl << " problem_size:" << std::endl << operator_args.problem_size 
<< std::endl - << " epilouge (alpha, beta): " + << " epilogue (alpha, beta): " << operator_args.epilogue.alpha << ", " << operator_args.epilogue.beta << std::endl << " ref_A (ptr, {stride}): " diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu index a4e234ab..36334576 100644 --- a/tools/library/src/util.cu +++ b/tools/library/src/util.cu @@ -428,7 +428,7 @@ static struct { NumericTypeID enumerant; } NumericTypeID_enumerants[] = { - {"unknown", "", NumericTypeID::kUnknown}, + {"unknown", "", NumericTypeID::kUnknown}, {"void", "Void", NumericTypeID::kVoid}, {"b1", "B1", NumericTypeID::kB1}, {"u2", "U2", NumericTypeID::kU2}, @@ -465,7 +465,7 @@ NumericTypeID_enumerants[] = { {"cs16", "CS16", NumericTypeID::kCS16}, {"cs32", "CS32", NumericTypeID::kCS32}, {"cs64", "CS64", NumericTypeID::kCS64}, - {"*", "", NumericTypeID::kUnknown} + {"*", "", NumericTypeID::kUnknown} }; /// Converts a NumericTypeID enumerant to a string @@ -958,7 +958,7 @@ static struct { ConvKind enumerant; } ConvKind_enumerants[] = { - {"unknown", "", ConvKind::kUnknown}, + {"unknown", "", ConvKind::kUnknown}, {"fprop", "", ConvKind::kFprop}, {"dgrad", "", ConvKind::kDgrad}, {"wgrad", "", ConvKind::kWgrad}, diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu index 0693058b..dfbce274 100644 --- a/tools/profiler/src/conv2d_operation_profiler.cu +++ b/tools/profiler/src/conv2d_operation_profiler.cu @@ -270,7 +270,7 @@ Status Conv2dOperationProfiler::initialize_configuration( } //////////////////////// Convolution output dimensions p and q //////////////////////// - // Cutlass convolutions support arbitrary output sizes and not constriant by // + // Cutlass convolutions support arbitrary output sizes and not constrained by // // input, filter, padding, striding, dilation sizes. // // cuDNN sets the output dimensions (p, q) using following equations: // // // @@ -502,7 +502,7 @@ void Conv2dOperationProfiler::initialize_result_( // Bytes of activation, filter, and output tensors result.bytes = problem_.bytes(operation_desc); - // Theoritical flops required for the computation + // Theoretical flops required for the computation result.flops = problem_.flops(operation_desc); // Measured runtime @@ -510,7 +510,7 @@ void Conv2dOperationProfiler::initialize_result_( } -/// Initialize reduction problem dimenstions and library::Operation +/// Initialize reduction problem dimensions and library::Operation bool Conv2dOperationProfiler::initialize_reduction_configuration_( Options const &options, PerformanceReport &report, @@ -535,7 +535,7 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_( /// This chooses the appropriate stride element of the row-major C tensor. int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ? 
2 : 0); - /// intialize library::ReductionConfiguration + /// initialize library::ReductionConfiguration conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn(); conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices); conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product(); @@ -773,7 +773,7 @@ bool Conv2dOperationProfiler::verify_cutlass( conv_workspace_.arguments.alpha = problem_.alpha_one.data(); conv_workspace_.arguments.beta = problem_.beta_zero.data(); - /// intialize library::ReductionArguments + /// initialize library::ReductionArguments conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); @@ -961,7 +961,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_( conv_desc.tile_description.math_instruction.element_accumulator, conv_desc.element_epilogue); -#if 0 // debug print to check which host refererence instance is selected +#if 0 // debug print to check which host reference instance is selected std::cout << conv2d_key << "\n"; #endif @@ -982,7 +982,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_( return true; } - // host refernce has only one instances in Conv2dOperationVectorMap + // host reference has only one instances in Conv2dOperationVectorMap library::Operation const *reference_op = cc_it->second[0]; // @@ -1009,7 +1009,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_( conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; // - // Intialize host reference operation + // Initialize host reference operation // std::vector host_workspace_reference_op; @@ -1114,11 +1114,11 @@ bool Conv2dOperationProfiler::verify_with_device_reference_( return true; } - // device refernce has only one instances in Conv2dOperationVectorMap + // device reference has only one instances in Conv2dOperationVectorMap library::Operation const *reference_op = cc_it->second[0]; // - // Intialize device reference operation + // Initialize device reference operation // std::vector host_workspace_reference_op; @@ -1205,7 +1205,7 @@ bool Conv2dOperationProfiler::profile( conv_workspace_.arguments.alpha = problem_.alpha_one.data(); conv_workspace_.arguments.beta = problem_.beta_zero.data(); - /// intialize library::ReductionArguments + /// initialize library::ReductionArguments conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); @@ -1276,7 +1276,7 @@ Status Conv2dOperationProfiler::profile_cutlass_( // update library::ConvArguments for parallel split-k reduction conv_arguments->D = conv_workspace_.device_workspace.data(); - /// intialize library::ReductionArguments + /// initialize library::ReductionArguments conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); @@ -1329,7 +1329,7 @@ Status Conv2dOperationProfiler::profile_cutlass_( // update library::ConvArguments for parallel split-k reduction conv_arguments->D = 
conv_workspace_.device_workspace.data(); - /// intialize library::ReductionArguments + /// initialize library::ReductionArguments conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h index f432c7e5..8b285ecb 100644 --- a/tools/profiler/src/conv2d_operation_profiler.h +++ b/tools/profiler/src/conv2d_operation_profiler.h @@ -189,7 +189,7 @@ class Conv2dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix A + // Returns leading dimension for equivalent gemm matrix A int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -200,7 +200,7 @@ class Conv2dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix B + // Returns leading dimension for equivalent gemm matrix B int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -211,7 +211,7 @@ class Conv2dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix C + // Returns leading dimension for equivalent gemm matrix C int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -436,7 +436,7 @@ class Conv2dOperationProfiler : public OperationProfiler { void *device_workspace); - /// Initialize reduction problem dimenstions and library::Operation + /// Initialize reduction problem dimensions and library::Operation bool initialize_reduction_configuration_( Options const &options, PerformanceReport &report, diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu index 34fee856..da9c3653 100644 --- a/tools/profiler/src/conv3d_operation_profiler.cu +++ b/tools/profiler/src/conv3d_operation_profiler.cu @@ -284,7 +284,7 @@ Status Conv3dOperationProfiler::initialize_configuration( } //////////////////////// Convolution output dimensions p and q //////////////////////// - // Cutlass convolutions support arbitrary output sizes and not constriant by // + // Cutlass convolutions support arbitrary output sizes and not constrained by // // input, filter, padding, striding, dilation sizes. // // cuDNN sets the output dimensions (p, q) using following equations: // // // @@ -545,7 +545,7 @@ void Conv3dOperationProfiler::initialize_result_( // Bytes of activation, filter, and output tensors result.bytes = problem_.bytes(operation_desc); - // Theoritical flops required for the computation + // Theoretical flops required for the computation result.flops = problem_.flops(operation_desc); // Measured runtime @@ -553,7 +553,7 @@ void Conv3dOperationProfiler::initialize_result_( } -/// Initialize reduction problem dimenstions and library::Operation +/// Initialize reduction problem dimensions and library::Operation bool Conv3dOperationProfiler::initialize_reduction_configuration_( Options const &options, PerformanceReport &report, @@ -578,7 +578,7 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_( /// This chooses the appropriate stride element of the row-major C tensor. int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ? 
3 : 0); - /// intialize library::ReductionConfiguration + /// initialize library::ReductionConfiguration conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn(); conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices); conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product(); @@ -947,7 +947,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_( conv_desc.tile_description.math_instruction.element_accumulator, conv_desc.element_epilogue); -#if 0 // debug print to check which host refererence instance is selected +#if 0 // debug print to check which host reference instance is selected std::cout << conv_key << "\n"; #endif @@ -968,7 +968,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_( return true; } - // host refernce has only one instances in ConvOperationVectorMap + // host reference has only one instances in ConvOperationVectorMap library::Operation const *reference_op = cc_it->second[0]; // @@ -993,7 +993,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_( conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; // - // Intialize host reference operation + // Initialize host reference operation // std::vector host_workspace_reference_op; @@ -1109,7 +1109,7 @@ void Conv3dOperationProfiler::set_cutlass_operator_arguments_(int problem_idx) { conv_workspace_.arguments.alpha = problem_.alpha_one.data(); conv_workspace_.arguments.beta = problem_.beta_zero.data(); - /// intialize library::ReductionArguments + /// initialize library::ReductionArguments conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h index aba832ec..4205d561 100644 --- a/tools/profiler/src/conv3d_operation_profiler.h +++ b/tools/profiler/src/conv3d_operation_profiler.h @@ -105,7 +105,7 @@ class Conv3dOperationProfiler : public OperationProfiler { /// Total number of flops computed int64_t flops(library::ConvDescription const &operation_desc) const; - /// Infers output size from theinput size, padding, stride, and dilation + /// Infers output size from the input size, padding, stride, and dilation void set_default_output_size() { z = ((d + pad_d - t * dilation_d) / stride_d) + 1; p = ((h + pad_h - r * dilation_h) / stride_h) + 1; @@ -190,7 +190,7 @@ class Conv3dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix A + // Returns leading dimension for equivalent gemm matrix A int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -201,7 +201,7 @@ class Conv3dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix B + // Returns leading dimension for equivalent gemm matrix B int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -212,7 +212,7 @@ class Conv3dOperationProfiler : public OperationProfiler { } } - // Returns leading dimenstion for equivalent gemm matrix C + // Returns leading dimension for equivalent gemm matrix C int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { switch (conv_kind) { @@ -389,7 +389,7 @@ class Conv3dOperationProfiler : 
public OperationProfiler { void *host_workspace, void *device_workspace); - /// Initialize reduction problem dimenstions and library::Operation + /// Initialize reduction problem dimensions and library::Operation bool initialize_reduction_configuration_( Options const &options, PerformanceReport &report, diff --git a/tools/profiler/src/cublas_helpers.cu b/tools/profiler/src/cublas_helpers.cu index 5f7354cf..2175b359 100644 --- a/tools/profiler/src/cublas_helpers.cu +++ b/tools/profiler/src/cublas_helpers.cu @@ -57,7 +57,7 @@ Status get_cutlass_status(cublasStatus_t cublas) { return Status::kErrorInternal; } -/// Converts a cuBLASS status to cutlass::profiler::Disposition +/// Converts a cuBLAS status to cutlass::profiler::Disposition Disposition get_cutlass_disposition(cublasStatus_t cublas_status) { if (cublas_status == CUBLAS_STATUS_INVALID_VALUE) { diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h index 8c36fb7b..3f38adbc 100644 --- a/tools/profiler/src/cublas_helpers.h +++ b/tools/profiler/src/cublas_helpers.h @@ -54,7 +54,7 @@ namespace profiler { /// Converts a cuBLAS status to cutlass::Status Status get_cutlass_status(cublasStatus_t cublas); -/// Converts a cuBLASS status to cutlass::profiler::Disposition +/// Converts a cuBLAS status to cutlass::profiler::Disposition Disposition get_cutlass_disposition(cublasStatus_t cublas_status); /// Maps a CUTLASS tensor layout to a cuBLAS transpose operation @@ -87,7 +87,7 @@ Status cublas_satisfies(library::SymmDescription const &desc); /// This is a helper class to create cublasHandle_t automatically on CublasCreate object creation and /// to destroy cublasHandle_t on CublasCreate object destruction. -/// Additionaly, it provides implicit cast from CublasCreate's object to cublasHandle_t's object +/// Additionally, it provides implicit cast from CublasCreate's object to cublasHandle_t's object class CublasCreate { private: cublasHandle_t handle; @@ -196,7 +196,7 @@ struct cublasGemmExDispatcher { library::GemmUniversalConfiguration configuration; library::GemmUniversalArguments arguments; - // cublass-specific data structures to fill cublas API call arguments + // cublas-specific data structures to fill cublas API call arguments cublasOperation_t trans_A; cublasOperation_t trans_B; cudaDataType_t data_type_A; @@ -237,7 +237,7 @@ struct cublasRankKDispatcher { library::RankKConfiguration configuration; library::RankKArguments arguments; - // cublass-specific data structures to fill cublas API call arguments + // cublas-specific data structures to fill cublas API call arguments cublasOperation_t trans_A; cublasFillMode_t uplo; cudaDataType_t data_type_A; @@ -277,7 +277,7 @@ struct cublasTrmmDispatcher { library::TrmmConfiguration configuration; library::TrmmArguments arguments; - // cublass-specific data structures to fill cublas API call arguments + // cublas-specific data structures to fill cublas API call arguments cublasOperation_t trans_A; cublasSideMode_t side; cublasFillMode_t uplo; @@ -318,7 +318,7 @@ struct cublasSymmDispatcher { library::SymmConfiguration configuration; library::SymmArguments arguments; - // cublass-specific data structures to fill cublas API call arguments + // cublas-specific data structures to fill cublas API call arguments cublasSideMode_t side; cublasFillMode_t uplo; cudaDataType_t data_type_A; diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp index 69522794..844119d1 100644 --- a/tools/profiler/src/cudnn_helpers.cpp +++ 
b/tools/profiler/src/cudnn_helpers.cpp @@ -68,7 +68,7 @@ Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status) { return Disposition::kFailed; } -/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. throws exception +/// Checks cudnnStatus_t converts to cutlass status and returns if Status::kSuccess o.w. throws exception Status checkCudnnErr(cudnnStatus_t cudnn_status) { Status cutlass_status = get_cutlass_status(cudnn_status); if(cutlass_status != Status::kSuccess) { @@ -243,7 +243,7 @@ Status cudnn_satisfies( } //////////////////////// Convolution output dimensions p and q /////////////////////// - // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // Cutlass convolutions support arbitrary output dimensions and not constrained by // // input, filter, padding, striding, dilation sizes. // // cuDNN sets the output dimensions (p, q) using following equations: // // // @@ -373,7 +373,7 @@ Status cudnn_satisfies( } //////////////////////// Convolution output dimensions p and q /////////////////////// - // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // Cutlass convolutions support arbitrary output dimensions and not constrained by // // input, filter, padding, striding, dilation sizes. // // cuDNN sets the output dimensions (p, q) using following equations: // // // diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h index 2f023825..e1c4f644 100644 --- a/tools/profiler/src/cudnn_helpers.h +++ b/tools/profiler/src/cudnn_helpers.h @@ -55,7 +55,7 @@ Status get_cutlass_status(cudnnStatus_t cudnn_status); /// Converts a cuDNN status to cutlass::profiler::Disposition Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status); -/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. throws exception +/// Checks cudnnStatus_t converts to cutlass status and returns if Status::kSuccess o.w. throws exception Status checkCudnnErr(cudnnStatus_t cudnn_status); /// Maps a CUTLASS conv mode to a cuDNN conv mode enumeration @@ -82,7 +82,7 @@ float cast_cudnn_compute_type_to_float(library::NumericTypeID type, void const * /// This is a helper class to create cudnnHandle_t automatically on CudnnCreate object creation and /// to destroy cudnnHandle_t on CudnnCreate object destruction. 
-/// Additionaly, it provides implicit cast from CudnnCreate's object to cudnnHandle_t's object +/// Additionally, it provides implicit cast from CudnnCreate's object to cudnnHandle_t's object class CudnnCreate { private: cudnnHandle_t handle; @@ -162,7 +162,7 @@ struct cudnnConvDispatcher { // Methods // - // TODO: unify ctor cudnnConvDispatcher for conv2d and conv3d by unifying Conv2dConfigration + // TODO: unify ctor cudnnConvDispatcher for conv2d and conv3d by unifying Conv2dConfiguration // ctor for conv2d cudnnConvDispatcher( @@ -496,7 +496,7 @@ struct cudnnConvDispatcher { workspace = cutlass::device_memory::allocation(workspace_size_in_bytes); } - /// Executes Conv2d operater from cudnn library + /// Executes Conv2d operator from cudnn library cudnnStatus_t operator()(cudnnHandle_t handle) { switch (conv_kind) { @@ -552,7 +552,7 @@ struct cudnnConvDispatcher { } } - // Returns Actviation Tensor + // Returns Activation Tensor void const * activation() const { switch(conv_kind) { case library::ConvKind::kFprop : return arguments.A; diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h index 83e2c338..cd80c46f 100644 --- a/tools/profiler/src/debug.h +++ b/tools/profiler/src/debug.h @@ -39,7 +39,7 @@ //#define report(x) { std::cout << "\033[31m" << __FILE__ << ":" << __LINE__ << " " << x << "\033[0m" << std::endl; } //#define report(x) {} -// Enable/Disble Profiler debug prints +// Enable/Disable Profiler debug prints //#define DEBUG_PROFILER //RED 31m // profiler prints debug messages in red diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index e59c3447..92679ef5 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -442,12 +442,12 @@ int DeviceAllocation::batch_count() const { return batch_count_; } -/// Gets the stride (in units of elements) beteween items +/// Gets the stride (in units of elements) between items int64_t DeviceAllocation::batch_stride() const { return batch_stride_; } -/// Gets the stride (in units of bytes) beteween items +/// Gets the stride (in units of bytes) between items int64_t DeviceAllocation::batch_stride_bytes() const { return bytes(type_, batch_stride_); } diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h index d0bdfd45..f1362e76 100644 --- a/tools/profiler/src/device_allocation.h +++ b/tools/profiler/src/device_allocation.h @@ -176,10 +176,10 @@ class DeviceAllocation { /// Gets the number of adjacent tensors in memory int batch_count() const; - /// Gets the stride (in units of elements) beteween items + /// Gets the stride (in units of elements) between items int64_t batch_stride() const; - /// Gets the stride (in units of bytes) beteween items + /// Gets the stride (in units of bytes) between items int64_t batch_stride_bytes() const; /// Capacity of allocation in number of elements diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu index 4b15fda5..0924c033 100644 --- a/tools/profiler/src/gemm_operation_profiler.cu +++ b/tools/profiler/src/gemm_operation_profiler.cu @@ -108,7 +108,7 @@ void GemmOperationProfiler::print_examples(std::ostream &out) const { << "Run when A is f16 with column-major and B is any datatype with row-major (For column major, use column, col, or n. 
For row major use, row or t):\n" << " $ cutlass_profiler --operation=Gemm --A=f16:column --B=*:row\n\n" - << "Profile a particular problem size with split K and paralell reduction:\n" + << "Profile a particular problem size with split K and parallel reduction:\n" << " $ cutlass_profiler --operation=Gemm --split_k_mode=parallel --split_k_slices=2 --m=1024 --n=1024 --k=128\n\n" << "Using various input value distribution:\n" @@ -168,7 +168,7 @@ Status GemmOperationProfiler::GemmProblem::parse( } if (!arg_as_SplitKModeID(this->split_k_mode, "split_k_mode", problem_space, problem)) { - // defualt value + // default value this->split_k_mode = library::SplitKMode::kSerial; } @@ -405,7 +405,7 @@ void GemmOperationProfiler::initialize_result_( } -/// Initialize redution problem dimentions and library::Operation +/// Initialize reduction problem dimensions and library::Operation bool GemmOperationProfiler::initialize_reduction_configuration_( library::Operation const *operation, ProblemSpace::Problem const &problem) { @@ -434,7 +434,7 @@ bool GemmOperationProfiler::initialize_reduction_configuration_( gemm_desc.tile_description.math_instruction.element_accumulator, // element workspace gemm_desc.tile_description.math_instruction.element_accumulator, // element accumulator gemm_desc.C.element, // element output - gemm_desc.element_epilogue // element coumpute + gemm_desc.element_epilogue // element compute ); auto reduction_it = library::Singleton::get().operation_table.reduction_operations.find(reduction_key); diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp index e2397586..14f0d91b 100644 --- a/tools/profiler/src/gpu_timer.cpp +++ b/tools/profiler/src/gpu_timer.cpp @@ -94,7 +94,7 @@ void GpuTimer::stop_and_wait(cudaStream_t stream) { } } -/// Returns the duration in miliseconds +/// Returns the duration in milliseconds double GpuTimer::duration(int iterations) const { float avg_ms; diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h index d8bce957..a3d3befd 100644 --- a/tools/profiler/src/gpu_timer.h +++ b/tools/profiler/src/gpu_timer.h @@ -62,7 +62,7 @@ struct GpuTimer { /// Records a stop event in the stream and synchronizes on the stream void stop_and_wait(cudaStream_t stream = nullptr); - /// Returns the duration in miliseconds + /// Returns the duration in milliseconds double duration(int iterations = 1) const; }; diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h index a2b0bdd9..17b4413c 100644 --- a/tools/profiler/src/operation_profiler.h +++ b/tools/profiler/src/operation_profiler.h @@ -81,7 +81,7 @@ class OperationProfiler { /// List of providers used to verify and compare each result ProviderVector verification_providers_; - /// Model performance result initailized by the operation profiler with workload statistics + /// Model performance result initialized by the operation profiler with workload statistics /// and reasonable default state. 
PerformanceResult model_result_; diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index ea79a9d7..3401d15b 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -189,7 +189,7 @@ Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) { enabled = false; } else if (provider != library::Provider::kReferenceHost && provider != library::Provider::kReferenceDevice) { - throw std::runtime_error("Unsupported intialization provider specified."); + throw std::runtime_error("Unsupported initialization provider specified."); } } else { @@ -205,7 +205,7 @@ Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) { get_distribution(cmdline, "dist", data_distribution); } else { - // profiler choosen data distribution (allowed to change based on numeric types) + // profiler chosen data distribution (allowed to change based on numeric types) fix_data_distribution = false; // set uniform data distribution with range [-4, 4] data_distribution.set_uniform(-4, 4, 0); diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 02edd9ac..eba0172f 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -231,7 +231,7 @@ class Options { std::vector> pivot_tags; /// If true, reports status of all kernels including those that were - /// not run for the given argumetns + /// not run for the given arguments bool report_not_run; /// Prints human-readable text to stdout. If false, nothing is written to stdout diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 4e102e64..8ec65ca8 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -284,7 +284,7 @@ struct ScalarArgument : public KernelArgument { // Data members // - /// Set of posible values + /// Set of possible values ValueCollection values; // @@ -540,7 +540,7 @@ struct IntegerArgument : public KernelArgument { // Data members // - /// Set of posible values + /// Set of possible values RangeCollection ranges; // diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index 65cf9a1a..9b6738d9 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -188,7 +188,7 @@ struct CommandLine { for (int i = 0; i < keys.size(); ++i) { if (keys[i] == string(arg_name)) { string val_string(values[i]); - seperate_string(val_string, vals, sep); + separate_string(val_string, vals, sep); } } } @@ -225,7 +225,7 @@ struct CommandLine { range != ranges.end(); ++range) { std::vector range_vals; - seperate_string(*range, range_vals, sep); + separate_string(*range, range_vals, sep); vals.push_back(range_vals); } } @@ -283,7 +283,7 @@ struct CommandLine { } template - static void seperate_string(std::string const& str, + static void separate_string(std::string const& str, std::vector& vals, char sep = ',') { std::istringstream str_stream(str); diff --git a/tools/util/include/cutlass/util/device_groupnorm.h b/tools/util/include/cutlass/util/device_groupnorm.h index aaa19b2d..5b78aa64 100644 --- a/tools/util/include/cutlass/util/device_groupnorm.h +++ b/tools/util/include/cutlass/util/device_groupnorm.h @@ -314,7 +314,7 @@ __global__ void groupnorm_twopass_multiple_load(T* output, } //ref_input & ref_output should be [N, H, W, C] -//ref_gamma & ref_beta shoud be [1, 1, 1, C] +//ref_gamma & ref_beta should be [1, 1, 1, C] template void groupnorm(cutlass::Tensor4DCoord 
input_size, const int num_groups, diff --git a/tools/util/include/cutlass/util/device_nhwc_padding.h b/tools/util/include/cutlass/util/device_nhwc_padding.h index 86e5fa77..c489d7d1 100644 --- a/tools/util/include/cutlass/util/device_nhwc_padding.h +++ b/tools/util/include/cutlass/util/device_nhwc_padding.h @@ -109,9 +109,9 @@ __global__ void nhwc_padding_channel_3To4_kernel(const int32_t n, shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx]; __syncthreads(); - const int ouput_offset = blockIdx.x * 256; - const int lower_bound = max_output_element < ouput_offset + 256 ? max_output_element : ouput_offset + 256; - for (int i = ouput_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192) + const int output_offset = blockIdx.x * 256; + const int lower_bound = max_output_element < output_offset + 256 ? max_output_element : output_offset + 256; + for (int i = output_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192) { const Telement* shm_element = (const Telement*)shm + j*3*element_in_Tio/4; Telement array[element_in_Tio]; @@ -140,9 +140,9 @@ __global__ void nhwc_padding_channel_3To8_kernel(const int32_t n, shm[threadIdx.x] = tidx >= max_input_element ? zero_io : input[tidx]; __syncthreads(); - const int ouput_offset = blockIdx.x * 512; - const int lower_bound = max_output_element < ouput_offset + 512 ? max_output_element : ouput_offset + 512; - for (int i = ouput_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192) + const int output_offset = blockIdx.x * 512; + const int lower_bound = max_output_element < output_offset + 512 ? max_output_element : output_offset + 512; + for (int i = output_offset + threadidx, j = threadidx ; i < lower_bound ; i+=192, j+=192) { const Telement* shm_element = (const Telement*)shm + (element_in_Tio == 4 ? 
j/2 : j)*3; Telement array[element_in_Tio]; diff --git a/tools/util/include/cutlass/util/helper_cuda.hpp b/tools/util/include/cutlass/util/helper_cuda.hpp index 15e0bc85..d840db56 100644 --- a/tools/util/include/cutlass/util/helper_cuda.hpp +++ b/tools/util/include/cutlass/util/helper_cuda.hpp @@ -74,7 +74,7 @@ _ConvertSMVer2Cores(int major, int minor) // Defines for GPU Architecture types (using the SM version to determine // the # of cores per SM typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, + int SM; // 0xMm (hexadecimal notation), M = SM Major version, // and m = SM minor version int Cores; } sSMtoCores; diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h index 1850c2f9..a083bd14 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/gemm.h @@ -248,7 +248,7 @@ struct Gemm diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index ea5359f7..d294258b 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -72,7 +72,7 @@ struct TensorForEachHelper { template struct TensorForEachHelper { - /// Constructor for fastest chaning rank + /// Constructor for fastest changing rank __inline__ __device__ TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h index 8568e47c..b4238a0a 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h @@ -1308,7 +1308,7 @@ void TensorFill( /////////////////////////////////////////////////////////////////////////////////////////////////// -/// Fills a tensor's digonal with 1 and 0 everywhere else. +/// Fills a tensor's diagonal with 1 and 0 everywhere else. 
template < typename Element, ///< Element type typename Layout> ///< Layout function diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index cac558df..bb6f935e 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -133,4 +133,4 @@ struct BlockForEach { } // namespace device } // namespace reference -} // namesace cutlass +} // namespace cutlass diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index cd87e6f7..f70e0699 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -335,7 +335,7 @@ struct Gemm diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h index a8b938d1..3db176ed 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h @@ -992,7 +992,7 @@ void TensorFillDiagonal( /////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////// -/// Helper to fill a tensor's digonal with 1 and 0 everywhere else. +/// Helper to fill a tensor's diagonal with 1 and 0 everywhere else. template < typename Element, ///< Element type typename Layout> ///< Layout function diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h index a195893b..68a36d86 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h @@ -69,7 +69,7 @@ struct TensorForEachHelper { /// Index of the active rank static int const kActiveRank = Rank - 1; - /// Constructor for fastest chaning rank + /// Constructor for fastest changing rank TensorForEachHelper( Func &func, Coord const &extent,