diff --git a/README.md b/README.md index dc7cde32..79c11b3e 100644 --- a/README.md +++ b/README.md @@ -328,7 +328,7 @@ or a subset of kernels for NVIDIA Ampere and Turing architecture: ### Building a subset Tensor Core GEMM kernels -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targetting NVIDIA Ampere and Turing architecture, +To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 @@ -376,7 +376,7 @@ reference_device: Passed ### Building one CUDA Core GEMM kernel -To compile one SGEMM kernel targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 ... @@ -418,7 +418,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ### Building a subset of Tensor Core Convolution kernels To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation -and FP16 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 ... @@ -466,7 +466,7 @@ reference_device: Passed ### Building one Convolution CUDA kernel To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation -and FP32 input targetting NVIDIA Ampere and Turing architecture, use the below cmake command line: +and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc ... diff --git a/docs/annotated.html b/docs/annotated.html index 43923cc2..233691c2 100644 --- a/docs/annotated.html +++ b/docs/annotated.html @@ -280,15 +280,15 @@
- + - + - + @@ -594,7 +594,7 @@ - + @@ -620,7 +620,7 @@ - + diff --git a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html index d53d8d67..6800f4fe 100644 --- a/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html +++ b/docs/classcutlass_1_1gemm_1_1device_1_1GemmBatched_3_01ElementA___00_01LayoutA___00_01ElementB___00_0c9bb6f4463ab6085e6008b5d5ad6abfd.html @@ -108,7 +108,7 @@Parital specialization for column-major output exchanges problem size and operand. +
Partial specialization for column-major output exchanges problem size and operand.
#include <gemm_batched.h>
Parital specialization for column-major output exchanges problem size and operand. +
Partial specialization for column-major output exchanges problem size and operand.
#include <gemm_complex.h>
Parital specialization for column-major output exchanges problem size and operand. +
Partial specialization for column-major output exchanges problem size and operand.
#include <gemm.h>
@@ -247,7 +247,7 @@
| template<typename Element , typename Layout > | | Helper to fill a tensor's diagonal with 1 and 0 everywhere else. More... | | template<typename Element , typename Layout > | keys | cutlass::CommandLine | num_naked_args() const | cutlass::CommandLine | inline | separate_string(std::string const &str, std::vector< value_t > &vals, char sep= ',') | cutlass::CommandLine | inlinestatic | tokenize(std::vector< std::pair< std::string, std::string > > &tokens, std::string const &str, char delim= ',', char sep= ':') | cutlass::CommandLine | inlinestatic | tokenize(std::vector< std::string > &tokens, std::string const &str, char delim= ',', char sep= ':') | cutlass::CommandLine | inlinestatic | | Tokenizes a comma-delimited list of string pairs delimited by ':'. More... | | static void | separate_string (std::string const &str, std::vector< value_t > &vals, char sep= ',') | | |
@@ -548,7 +548,7 @@ |
static void cutlass::CommandLine::seperate_string | +static void cutlass::CommandLine::separate_string | ( | std::string const & | str, | diff --git a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html index 37cb3e5d..2f4bf08e 100644 --- a/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html +++ b/docs/structcutlass_1_1reference_1_1device_1_1Gemm_3_01ElementA_00_01LayoutA_00_01ElementB_00_01Layout660562b232f408218828ca5915b7e73a.html @@ -104,7 +104,7 @@||||||||||
Public Member Functions | ||||||||||||||
__inline__ __device__ | TensorForEachHelper (Func &func, Coord< Rank > const &size, Coord< Rank > &coord, int64_t index) | |||||||||||||
Constructor for fastest chaning rank. More... | ||||||||||||||
Constructor for fastest changing rank. More... | ||||||||||||||
Parital specialization for XOR-popc. +
Partial specialization for XOR-popc.
#include <gemm.h>
diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html
index 2a0a978e..cc752855 100644
--- a/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html
+++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2device_2gemm_8h.html
@@ -134,7 +134,7 @@
| Partial specialization for multiply-add-saturate. More... | | | Partial specialization for XOR-popc. More... | | |
diff --git a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html
index d20a0784..b0bfdbc2 100644
--- a/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html
+++ b/docs/tools_2util_2include_2cutlass_2util_2reference_2host_2gemm_8h.html
@@ -141,7 +141,7 @@
| Partial specialization for multiply-add-saturate. More... | | | Partial specialization for XOR-popc. More... | | |
diff --git a/docs/wmma__sm75_8h_source.html b/docs/wmma__sm75_8h_source.html
index 72ad72f9..6ff6405d 100644
--- a/docs/wmma__sm75_8h_source.html
+++ b/docs/wmma__sm75_8h_source.html
@@ -98,7 +98,7 @@
1 /*************************************************************************************************** 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 **************************************************************************************************/ 110 static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond"); 190 static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond"); Definition: aligned_buffer.h:35 1 /*************************************************************************************************** 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 17 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE 19 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 **************************************************************************************************/ 110 static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond"); 190 static_assert(false, "wmma.mma.sync interger type multiplicands is avialable only for SM75 and beyond"); Definition: aligned_buffer.h:35 integer_subbyte< 1, false > uint1b_t 1-bit Unsigned integer type Definition: integer_subbyte.h:152 |