From 749a8d3fca69839bfa7136be94049005d071b226 Mon Sep 17 00:00:00 2001
From: ANIKET SHIVAM <3268307+ANIKET-SHIVAM@users.noreply.github.com>
Date: Thu, 9 Mar 2023 12:27:40 -0800
Subject: [PATCH] Updates for 3.0 (#857)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
---
 README.md                          |  4 +-
 include/cutlass/float8.h           | 12 ++---
 include/cutlass/functional.h       | 78 +++++++++++++++++++++---------
 include/cutlass/wmma_array.h       | 42 +++++++++++++++-
 media/docs/quickstart.md           |  2 +-
 tools/library/CMakeLists.txt       |  1 +
 tools/library/scripts/generator.py | 24 +++++++++
 7 files changed, 131 insertions(+), 32 deletions(-)
diff --git a/README.md b/README.md
index a89b8f49..dc7cde32 100644
--- a/README.md
+++ b/README.md
@@ -311,7 +311,9 @@ $ make cutlass_profiler -j16
 
 By default, only one tile size is instantiated for each data type, math instruction, and layout.
 To instantiate all, set the following environment variable when running CMake from an empty `build/` directory.
-Beware, this results in *thousands* of kernels and long build times.
+Beware, this results in *tens of thousands* of kernels and long build times. 
+This would also result in a large binary size and on some platforms linker to fail on building the library.
+Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below.
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=all
 ...
diff --git a/include/cutlass/float8.h b/include/cutlass/float8.h
index 93e3209c..de32a218 100644
--- a/include/cutlass/float8.h
+++ b/include/cutlass/float8.h
@@ -399,7 +399,7 @@ struct alignas(1) float_e4m3_t : float8_base<FloatEncoding::E4M3> {
 
         return *reinterpret_cast<float_e4m3_t *>(&tmp);
     #else
-        return bitcast(Base::convert_float_to_fp8(float(flt)));
+        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
     #endif
     }
 
@@ -413,7 +413,7 @@ struct alignas(1) float_e4m3_t : float8_base<FloatEncoding::E4M3> {
 
         return reinterpret_cast<half2 const &>(packed).x;
     #else
-        return half(Base::convert_fp8_to_float(x.storage));
+        return __float2half(Base::convert_fp8_to_float(x.storage));
     #endif
     }
 
@@ -425,7 +425,7 @@ struct alignas(1) float_e4m3_t : float8_base<FloatEncoding::E4M3> {
         uint32_t packed;
         asm volatile("cvt.rn.f16x2.e4m3x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
 
-        return float(reinterpret_cast<half2 const &>(packed).x);
+        return __half2float(reinterpret_cast<half2 const &>(packed).x);
     #else
         return Base::convert_fp8_to_float(x.storage);
     #endif
@@ -609,7 +609,7 @@ struct alignas(1) float_e5m2_t : float8_base<FloatEncoding::E5M2> {
 
         return *reinterpret_cast<float_e5m2_t *>(&tmp);
     #else
-        return bitcast(Base::convert_float_to_fp8(float(flt)));
+        return bitcast(Base::convert_float_to_fp8(__half2float(flt)));
     #endif
     }
 
@@ -623,7 +623,7 @@ struct alignas(1) float_e5m2_t : float8_base<FloatEncoding::E5M2> {
 
         return reinterpret_cast<half2 const &>(packed).x;
     #else
-        return half(Base::convert_fp8_to_float(x.storage));
+        return __float2half(Base::convert_fp8_to_float(x.storage));
     #endif
     }
 
@@ -635,7 +635,7 @@ struct alignas(1) float_e5m2_t : float8_base<FloatEncoding::E5M2> {
         uint32_t packed;
         asm volatile("cvt.rn.f16x2.e5m2x2 %0, %1;\n" : "=r"(packed) : "h"(bits));
 
-        return float(reinterpret_cast<half2 const &>(packed).x);
+        return __half2float(reinterpret_cast<half2 const &>(packed).x);
     #else
         return Base::convert_fp8_to_float(x.storage);
     #endif
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 277bad5c..59aec46a 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -89,6 +89,59 @@ struct multiplies {
   }
 };
 
+#if defined(__CUDA_ARCH__)
+/// Partial specializations needed when __CUDA_NO_HALF2_OPERATORS__ is set
+template<>
+struct plus<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hadd2(lhs, rhs);
+  }
+};
+
+template<>
+struct minus<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hsub2(lhs, rhs);
+  }
+};
+
+template<>
+struct multiplies<__half2> {
+  CUTLASS_HOST_DEVICE
+  __half2 operator()(__half2 lhs, __half2 const &rhs) const {
+    return __hmul2(lhs, rhs);
+  }
+};
+
+/// Partial specializations needed when __CUDA_NO_HALF_OPERATORS__ is set
+template<>
+struct plus<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hadd(lhs, rhs);
+  }
+};
+
+template<>
+struct minus<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hsub(lhs, rhs);
+  }
+};
+
+template<>
+struct multiplies<__half> {
+  CUTLASS_HOST_DEVICE
+  __half operator()(__half lhs, __half const &rhs) const {
+    return __hmul(lhs, rhs);
+  }
+};
+#endif // defined(__CUDA_ARCH__)
+
+
 // Maximum with nan propogation
 // To propgate the NANs, the "max" of a two element that contains NaNs should also return a NaN 
 template <typename T>
@@ -411,36 +464,15 @@ struct red<half2>
   CUTLASS_DEVICE
   void operator()(half2 *ptr, const half2 &data)
   {
-#if !defined(__CUDA_ARCH__)
+#if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__)  && (__CUDA_ARCH__ < 600))
       CUTLASS_UNUSED(ptr);
       CUTLASS_UNUSED(data);
-#elif (__CUDA_ARCH__ >= 600)
+#else
 
     // Vector-2 atomic reduction requires .target sm_60 or higher
     uint32_t word = reinterpret_cast<const uint32_t&>(data);
     asm volatile ("red.gpu.global.add.noftz.f16x2 [%0], %1;\n" : : "l"(ptr), "r"(word));
 
-#else
-
-    // Use CAS loop
-    uint32_t *ptr_int = reinterpret_cast<uint32_t *>(ptr);
-    uint32_t old_int = *ptr_int;
-    uint32_t assumed_int;
-
-    do
-    {
-      half2 old = reinterpret_cast<half2&>(old_int);
-
-      half hi = __hadd(__high2half(old), __high2half(data));
-      half lo = __hadd(__low2half(old), __low2half(data));
-      half2 update = __halves2half2(hi, lo);
-      uint32_t update_int = reinterpret_cast<const uint32_t&>(update);
-
-      assumed_int = old_int;
-      old_int = atomicCAS(ptr_int, assumed_int, update_int);
-
-    } while (assumed_int != old_int);
-
 #endif // (__CUDA_ARCH__ >= 600)
   }
 };
diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h
index 4a074b67..1366a43b 100644
--- a/include/cutlass/wmma_array.h
+++ b/include/cutlass/wmma_array.h
@@ -52,7 +52,10 @@ template <
   /// Element type
   typename T,
   /// Number of elements in the array
-  int N
+  int N,
+  /// Whether the element type of T is half_t or __half
+  bool IsHalfType = (platform::is_same<typename T::element_type, cutlass::half_t>::value ||
+                     platform::is_same<typename T::element_type, __half>::value)
 >
 class WmmaFragmentArray: public Array<T, N, true> {
 public:
@@ -80,7 +83,44 @@ class WmmaFragmentArray: public Array<T, N, true> {
 
     return *this;
   }
+};
+
+/// Partial specialization for the case in which T::element_type is
+/// half_t or __half. This is needed because the cast (typename T::element_type)0
+/// in the primary template flags as an error when __CUDA_NO_HALF_CONVERSIONS__
+/// is set.
+template <
+  /// Element type
+  typename T,
+  /// Number of elements in the array
+  int N
+>
+class WmmaFragmentArray<T, N, true>: public Array<T, N, true> {
+public:
+
+  /// Efficient clear method (override Array::clear())
+  CUTLASS_HOST_DEVICE
+  void clear()
+  {
+    for(int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      nvcuda::wmma::fill_fragment((*this)[i], __float2half(0.f));
+    }
+  }
+
+  CUTLASS_HOST_DEVICE
+  WmmaFragmentArray<T, N>& operator+=(const WmmaFragmentArray<T, N>& rhs)
+  {
+    using element_type = typename T::element_type;
+    plus<T> add;
+
+    for (int i = 0; i < Array<T, N, true>::kElements; i++)
+    {
+      (*this)[i] = add((*this)[i], rhs[i]);
+    }
 
+    return *this;
+  }
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md
index f0d4d8a3..a320ad34 100644
--- a/media/docs/quickstart.md
+++ b/media/docs/quickstart.md
@@ -587,7 +587,7 @@ To instantiate all operations supporting all tile sizes, data types, and alignme
 ```bash
 $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=all
 ```
-The above command line generates about seven thousand kernels targetting NVIDIA Ampere, Turing, and Volta architectures. 
+The above command line generates about twenty thousand kernels targetting NVIDIA Ampere, Turing, and Volta architectures. 
 Compiling thousands of kernels for three different architectures is time consuming. Additionaly, this would also result 
 in a large binary size and on some platforms linker to fail on building the library.
 
diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
index 69e324a5..b523784a 100644
--- a/tools/library/CMakeLists.txt
+++ b/tools/library/CMakeLists.txt
@@ -100,6 +100,7 @@ execute_process(
     --kernels "${CUTLASS_LIBRARY_KERNELS}"
     --ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}"
     --cuda-version "${CUTLASS_GENERATOR_CUDA_COMPILER_VERSION}"
+    --log-level DEBUG
   RESULT_VARIABLE cutlass_lib_INSTANCE_GENERATION_RESULT
   OUTPUT_VARIABLE cutlass_lib_INSTANCE_GENERATION_OUTPUT
   OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log
diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py
index beb5b8ec..401370d1 100644
--- a/tools/library/scripts/generator.py
+++ b/tools/library/scripts/generator.py
@@ -8,6 +8,7 @@
 import os.path
 import shutil
 import argparse
+import logging
 
 from library import *
 from manifest import *
@@ -4838,8 +4839,30 @@ def GenerateSM90(manifest, cuda_version):
   parser.add_argument("--interface-dir", default=None, required=False, help="Interface header to kernels")
   parser.add_argument("--disable-full-archs-compilation", action="store_true", required=False, help="Disable compilation for every archs in --architectures")
 
+  def numeric_log_level(log_level: str) -> int:
+    """
+    Converts the string identifier of the log level into the numeric identifier used
+    in setting the log level
+
+    :param x: string representation of log level (e.g., 'INFO', 'DEBUG')
+    :type x: str
+
+    :return: numeric representation of log level
+    :rtype: int
+    """
+    numeric_level = getattr(logging, log_level.upper(), None)
+    if not isinstance(numeric_level, int):
+      raise ValueError(f'Invalid log level: {log_level}')
+    return numeric_level
+
+  parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
+                      help='Logging level to be used by the generator script')
+
   args = parser.parse_args()
 
+  # Set the logging level based on the user-provided `--log-level` command-line option
+  logging.basicConfig(level=args.log_level)
+
   manifest = Manifest(args)
 
   GenerateSM50(manifest, args.cuda_version)
@@ -4849,6 +4872,7 @@ def GenerateSM90(manifest, cuda_version):
   GenerateSM75(manifest, args.cuda_version)
   GenerateSM80(manifest, args.cuda_version)
   GenerateSM90(manifest, args.cuda_version)
+
   if 'library' in args.generator_target.split(','):
     manifest.emit(GeneratorTarget.Library)