packages/ml/rapids/raft/patches.diff

diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 440c8c4f..a0f26315 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -20,7 +20,7 @@ if(DISABLE_DEPRECATION_WARNINGS)
 endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
-    list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
+    list(APPEND RAFT_CXX_FLAGS -Wall -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
 endif()
 
 list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
@@ -30,10 +30,10 @@ list(APPEND RAFT_CUDA_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM")
 list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all)
 
 # set warnings as errors
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
-    list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
-endif()
-list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+#if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
+#    list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
+#endif()
+list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Wno-error=deprecated-declarations)
 
 # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking
 if(CUDA_ENABLE_LINEINFO)
diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh
index 6be994b8..f28f214e 100644
--- a/cpp/include/raft/distance/detail/canberra.cuh
+++ b/cpp/include/raft/distance/detail/canberra.cuh
@@ -77,7 +77,7 @@ static void canberraImpl(const DataT* x,
     const auto add  = raft::myAbs(x) + raft::myAbs(y);
     // deal with potential for 0 in denominator by
     // forcing 1/0 instead
-    acc += ((add != 0) * diff / (add + (add == 0)));
+    acc += AccT((add != (AccT)0)) * (AccT)diff / AccT(add + AccT(add == (AccT)0));
   };
 
   // epilogue operation lambda for final value calculation
diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh
index 2b77d280..c72248c8 100644
--- a/cpp/include/raft/distance/detail/correlation.cuh
+++ b/cpp/include/raft/distance/detail/correlation.cuh
@@ -98,13 +98,13 @@ static void correlationImpl(const DataT* x,
     if (gridStrideX == blockIdx.x * KPolicy::Nblk) {
       for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) {
         auto idx   = gridStrideY + i;
-        sx2Norm[i] = idx < m ? x2n[idx] : 0;
+        sx2Norm[i] = idx < m ? x2n[idx] : DataT(0);
       }
     }
 
     for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) {
       auto idx   = gridStrideX + i;
-      sy2Norm[i] = idx < n ? y2n[idx] : 0;
+      sy2Norm[i] = idx < n ? y2n[idx] : DataT(0);
     }
     __syncthreads();
 
@@ -121,11 +121,11 @@ static void correlationImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        auto numer   = k * acc[i][j] - (regxn[i] * regyn[j]);
-        auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
-        auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
+        auto numer   = AccT(k) * acc[i][j] - (regxn[i] * regyn[j]);
+        auto Q_denom = AccT(k) * regx2n[i] - (regxn[i] * regxn[i]);
+        auto R_denom = AccT(k) * regy2n[j] - (regyn[j] * regyn[j]);
 
-        acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
+        acc[i][j] = AccT(1) - (numer / raft::mySqrt(Q_denom * R_denom));
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh
index bed9d09e..5f65f099 100644
--- a/cpp/include/raft/distance/detail/hamming.cuh
+++ b/cpp/include/raft/distance/detail/hamming.cuh
@@ -82,7 +82,7 @@ static void hammingUnexpandedImpl(const DataT* x,
                                       DataT * regyn,
                                       IdxT gridStrideX,
                                       IdxT gridStrideY) {
-    const DataT one_over_k = DataT(1.0) / k;
+    const DataT one_over_k = DataT(1.0) / DataT(k);
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh
index 31854fd1..671289c0 100644
--- a/cpp/include/raft/distance/detail/hellinger.cuh
+++ b/cpp/include/raft/distance/detail/hellinger.cuh
@@ -106,9 +106,9 @@ static void hellingerImpl(const DataT* x,
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-        const auto finalVal  = (1 - acc[i][j]);
-        const auto rectifier = (!signbit(finalVal));
-        acc[i][j]            = raft::mySqrt(rectifier * finalVal);
+        const auto finalVal  = ((AccT)1 - acc[i][j]);
+        const auto rectifier = (!signbit((float)finalVal));
+        acc[i][j]            = raft::mySqrt((AccT)rectifier * finalVal);
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh
index 92ee071c..e2041240 100644
--- a/cpp/include/raft/distance/detail/jensen_shannon.cuh
+++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh
@@ -76,12 +76,12 @@ static void jensenShannonImpl(const DataT* x,
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const DataT m     = 0.5f * (x + y);
-    const bool m_zero = (m == 0);
-    const auto logM   = (!m_zero) * raft::myLog(m + m_zero);
+    const DataT m     = DataT(0.5f) * (x + y);
+    const bool m_zero = (m == DataT(0));
+    const auto logM   = DataT(!m_zero) * raft::myLog(m + DataT(m_zero));
 
-    const bool x_zero = (x == 0);
-    const bool y_zero = (y == 0);
+    const DataT x_zero = (x == DataT(0));
+    const DataT y_zero = (y == DataT(0));
     acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero)));
   };
 
@@ -95,7 +95,7 @@ static void jensenShannonImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = raft::mySqrt(0.5 * acc[i][j]);
+        acc[i][j] = raft::mySqrt(AccT(0.5) * acc[i][j]);
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh
index 4c0c4b6a..b24b6532 100644
--- a/cpp/include/raft/distance/detail/kl_divergence.cuh
+++ b/cpp/include/raft/distance/detail/kl_divergence.cuh
@@ -80,35 +80,35 @@ static void klDivergenceImpl(const DataT* x,
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     if (isRowMajor) {
-      const bool x_zero = (x == 0);
-      acc += x * (raft::myLog(x + x_zero) - y);
+      const bool x_zero = (x == DataT(0));
+      acc += x * (raft::myLog(x + DataT(x_zero)) - y);
     } else {
-      const bool y_zero = (y == 0);
-      acc += y * (raft::myLog(y + y_zero) - x);
+      const bool y_zero = (y == DataT(0));
+      acc += y * (raft::myLog(y + DataT(y_zero)) - x);
     }
   };
 
   auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) {
     if (isRowMajor) {
-      const bool x_zero = (x == 0);
-      const bool y_zero = (y == 0);
-      acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero));
+      const bool x_zero = (x == DataT(0));
+      const bool y_zero = (y == DataT(0));
+      acc += x * (raft::myLog(x + DataT(x_zero)) - DataT(!y_zero) * raft::myLog(y + DataT(y_zero)));
     } else {
-      const bool y_zero = (y == 0);
-      const bool x_zero = (x == 0);
-      acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero));
+      const bool y_zero = (x == DataT(0));
+      const bool x_zero = (y == DataT(0));
+      acc += y * (raft::myLog(y + DataT(y_zero)) - DataT(!x_zero) * raft::myLog(x + DataT(x_zero)));
     }
   };
 
   auto unaryOp_lambda = [] __device__(DataT input) {
-    const bool x_zero = (input == 0);
-    return (!x_zero) * raft::myLog(input + x_zero);
+    const bool x_zero = (input == DataT(0));
+    return DataT(!x_zero) * raft::myLog(input + DataT(x_zero));
   };
 
   auto unaryOp_lambda_reverse = [] __device__(DataT input) {
     // reverse previous log (x) back to x using (e ^ log(x))
-    const bool x_zero = (input == 0);
-    return (!x_zero) * raft::myExp(input);
+    const bool x_zero = (input == DataT(0));
+    return DataT(!x_zero) * raft::myExp(input);
   };
 
   // epilogue operation lambda for final value calculation
@@ -121,7 +121,7 @@ static void klDivergenceImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = (0.5f * acc[i][j]);
+        acc[i][j] = (DataT(0.5f) * acc[i][j]);
       }
     }
   };
diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh
index d3d0979d..cc84e49f 100644
--- a/cpp/include/raft/distance/detail/minkowski.cuh
+++ b/cpp/include/raft/distance/detail/minkowski.cuh
@@ -84,7 +84,7 @@ void minkowskiUnExpImpl(const DataT* x,
                                       DataT * regyn,
                                       IdxT gridStrideX,
                                       IdxT gridStrideY) {
-    const auto one_over_p = 1.0f / p;
+    const auto one_over_p = (DataT)1.0f / p;
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index 27e99353..371f2195 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -180,7 +180,7 @@ struct PairwiseDistances : public BaseClass {
     for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < P::AccColsPerTh; ++j) {
-        acc[i][j] = BaseClass::Zero;
+        acc[i][j] = 0; //BaseClass::Zero;
       }
     }
 
@@ -235,13 +235,13 @@ struct PairwiseDistances : public BaseClass {
       if (gridStrideX == blockIdx.x * P::Nblk) {
         for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) {
           auto idx  = gridStrideY + i;
-          sxNorm[i] = idx < this->m ? xn[idx] : 0;
+          sxNorm[i] = idx < this->m ? xn[idx] : (DataT)0;
         }
       }
 
       for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) {
         auto idx  = gridStrideX + i;
-        syNorm[i] = idx < this->n ? yn[idx] : 0;
+        syNorm[i] = idx < this->n ? yn[idx] : (DataT)0;
       }
 
       __syncthreads();
diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh
index 5d516e78..e398cdab 100644
--- a/cpp/include/raft/distance/detail/russell_rao.cuh
+++ b/cpp/include/raft/distance/detail/russell_rao.cuh
@@ -76,7 +76,7 @@ static void russellRaoImpl(const DataT* x,
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
-  const float one_over_k = 1.0 / k;
+  const AccT one_over_k = AccT(1.0) / AccT(k);
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [k, one_over_k] __device__(
                          AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
@@ -88,7 +88,7 @@ static void russellRaoImpl(const DataT* x,
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = (k - acc[i][j]) * one_over_k;
+        acc[i][j] = (AccT(k) - acc[i][j]) * one_over_k;
       }
     }
   };
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index 8aed0cb4..b3d2d2b6 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -154,6 +154,12 @@ template <typename DataT, int _veclen>
 struct Policy4x4 {
 };
 
+template <int _veclen>
+struct Policy4x4<half, _veclen> {
+  typedef KernelPolicy<half, _veclen, 64, 4, 4, 16, 16> Policy;
+  typedef ColKernelPolicy<half, _veclen, 64, 4, 4, 16, 16> ColPolicy;
+};
+
 template <int _veclen>
 struct Policy4x4<float, _veclen> {
   typedef KernelPolicy<float, _veclen, 32, 4, 4, 16, 16> Policy;
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index 5d83f88e..e0289d69 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -76,7 +76,7 @@ struct Contractions_NT {
   /** block of Y data loaded from global mem after `ldgXY()` */
   DataT ldgDataY[P::LdgPerThY][P::Veclen];
 
-  static constexpr DataT Zero = (DataT)0;
+  //static const/*expr*/ DataT Zero = (DataT)0;
 
  public:
   /**
@@ -199,7 +199,7 @@ struct Contractions_NT {
         } else {
 #pragma unroll
           for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataX[i][j] = Zero;
+            ldgDataX[i][j] = 0;
           }
         }
       }
@@ -213,7 +213,7 @@ struct Contractions_NT {
         } else {
 #pragma unroll
           for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataX[i][j] = Zero;
+            ldgDataX[i][j] = 0;
           }
         }
       }
@@ -232,7 +232,7 @@ struct Contractions_NT {
         } else {
 #pragma unroll
           for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataY[i][j] = Zero;
+            ldgDataY[i][j] = 0;
           }
         }
       }
@@ -246,7 +246,7 @@ struct Contractions_NT {
         } else {
 #pragma unroll
           for (int j = 0; j < P::Veclen; ++j) {
-            ldgDataY[i][j] = Zero;
+            ldgDataY[i][j] = 0;
           }
         }
       }
diff --git a/cpp/include/raft/util/cuda_utils.cuh b/cpp/include/raft/util/cuda_utils.cuh
index 1d1c82eb..c7951df0 100644
--- a/cpp/include/raft/util/cuda_utils.cuh
+++ b/cpp/include/raft/util/cuda_utils.cuh
@@ -257,6 +257,17 @@ DI double myAtomicMax(double* address, double val)
 template <typename T>
 HDI T myMax(T x, T y);
 template <>
+DI half myMax<half>(half x, half y)
+{
+#if __CUDA_ARCH__ >= 800
+  return __hmax(x, y);
+#elif defined(__CUDA_ARCH__)
+  return __hgt(x,y) ? x : y;
+#else
+  return float(x) > float(y) ? x : y;
+#endif
+}
+template <>
 HDI float myMax<float>(float x, float y)
 {
   return fmaxf(x, y);
@@ -275,6 +286,17 @@ HDI double myMax<double>(double x, double y)
 template <typename T>
 HDI T myMin(T x, T y);
 template <>
+DI half myMin<half>(half x, half y)
+{
+#if __CUDA_ARCH__ >= 800
+  return __hmin(x, y);
+#elif defined(__CUDA_ARCH__)
+  return __hlt(x,y) ? x : y;
+#else
+  return float(x) < float(y) ? x : y;
+#endif
+}
+template <>
 HDI float myMin<float>(float x, float y)
 {
   return fminf(x, y);
@@ -330,6 +352,15 @@ HDI int sgn(const T val)
 template <typename T>
 HDI T myExp(T x);
 template <>
+DI half myExp(half x)
+{
+#ifdef __CUDA_ARCH__
+  return hexp(x);
+#else
+  return expf(float(x));
+#endif
+}
+template <>
 HDI float myExp(float x)
 {
   return expf(x);
@@ -366,6 +397,15 @@ inline __device__ double myInf<double>()
 template <typename T>
 HDI T myLog(T x);
 template <>
+DI half myLog(half x)
+{
+#ifdef __CUDA_ARCH__
+  return hlog(x);
+#else
+  return logf(float(x));
+#endif
+}
+template <>
 HDI float myLog(float x)
 {
   return logf(x);
@@ -384,6 +424,15 @@ HDI double myLog(double x)
 template <typename T>
 HDI T mySqrt(T x);
 template <>
+DI half mySqrt(half x)
+{
+#ifdef __CUDA_ARCH__
+  return hsqrt(x);
+#else
+  return sqrtf(x);
+#endif
+}
+template <>
 HDI float mySqrt(float x)
 {
   return sqrtf(x);
@@ -420,6 +469,15 @@ DI void mySinCos(double x, double& s, double& c)
 template <typename T>
 DI T mySin(T x);
 template <>
+DI half mySin(half x)
+{
+#ifdef __CUDA_ARCH__
+  return hsin(x);
+#else
+  return sinf(float(x));
+#endif
+}
+template <>
 DI float mySin(float x)
 {
   return sinf(x);
@@ -441,6 +499,15 @@ DI T myAbs(T x)
   return x < 0 ? -x : x;
 }
 template <>
+DI half myAbs(half x)
+{
+#ifdef __CUDA_ARCH__
+  return __habs(x);
+#else
+  return fabsf(float(x));
+#endif
+}
+template <>
 DI float myAbs(float x)
 {
   return fabsf(x);
@@ -459,6 +526,11 @@ DI double myAbs(double x)
 template <typename T>
 HDI T myPow(T x, T power);
 template <>
+HDI half myPow(half x, half power)
+{
+  return powf(x, power);
+}
+template <>
 HDI float myPow(float x, float power)
 {
   return powf(x, power);
diff --git a/cpp/include/raft/util/device_loads_stores.cuh b/cpp/include/raft/util/device_loads_stores.cuh
index 2b87c44d..5df27152 100644
--- a/cpp/include/raft/util/device_loads_stores.cuh
+++ b/cpp/include/raft/util/device_loads_stores.cuh
@@ -146,6 +146,57 @@ DI void sts(int32_t* addr, const int32_t (&x)[4])
                : "l"(s4), "r"(x[0]), "r"(x[1]), "r"(x[2]), "r"(x[3]));
 }
 
+DI void sts(__half* addr, const __half& x)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
+#else
+  addr[0] = x;
+#endif
+}
+DI void sts(__half* addr, const __half (&x)[1])
+{
+#ifdef USE_HALF_LS_ASM
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
+#else
+  addr[0] = x[0];
+#endif
+}
+DI void sts(__half* addr, const __half (&x)[2])
+{
+#ifdef USE_HALF_LS_ASM
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
+  asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1]));
+#else
+  addr[0] = x[0]; addr[1] = x[1];
+#endif
+}
+DI void sts(__half* addr, const __half (&x)[4])
+{
+#ifdef USE_HALF_LS_ASM
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
+#else
+  addr[0] = x[0]; addr[1] = x[1]; addr[2] = x[2]; addr[3] = x[3];
+#endif
+}
+DI void sts(__half* addr, const __half (&x)[8])
+{
+#ifdef USE_HALF_LS_ASM
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
+               :
+               : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
+#else
+  addr[0] = x[0]; addr[1] = x[1]; addr[2] = x[2]; addr[3] = x[3];
+  addr[4] = x[4]; addr[5] = x[5]; addr[6] = x[6]; addr[7] = x[7];
+#endif
+}
+
 DI void sts(float* addr, const float& x)
 {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
@@ -344,6 +395,57 @@ DI void lds(float (&x)[4], const float* addr)
                : "l"(s4));
 }
 
+DI void lds(__half& x, __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
+#else
+  x = addr[0];
+#endif
+}
+DI void lds(__half (&x)[1], __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
+  asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
+#else
+  x[0] = addr[0];
+#endif
+}
+DI void lds(__half (&x)[2], __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2));
+#else
+  x[0] = addr[0]; x[1] = addr[1];
+#endif
+}
+DI void lds(__half (&x)[4], __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
+               : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
+               : "l"(s4));
+#else
+  x[0] = addr[0]; x[1] = addr[1]; x[2] = addr[2]; x[3] = addr[3];
+#endif
+}
+DI void lds(__half (&x)[8], __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
+  asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
+               : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
+               : "l"(s4));
+#elseW
+  x[0] = addr[0]; x[1] = addr[1]; x[2] = addr[2]; x[3] = addr[3];
+  x[4] = addr[4]; x[5] = addr[5]; x[6] = addr[6]; x[7] = addr[7];
+#endif
+}
+
 DI void lds(float& x, float* addr)
 {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
@@ -390,6 +492,52 @@ DI void lds(double (&x)[2], double* addr)
  * @param[out] x    data to be loaded from global memory
  * @param[in]  addr address in global memory from where to load
  */
+DI void ldg(__half& x, const __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  asm volatile("ld.global.cg.f16 %0, [%1];" : "=h"(x) : "l"(addr));
+#else
+  x = addr[0];
+#endif
+}
+DI void ldg(__half (&x)[1], const __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  asm volatile("ld.global.cg.f16 %0, [%1];" : "=h"(x[0]) : "l"(addr));
+#else
+  x[0] = addr[0];
+#endif
+}
+DI void ldg(__half (&x)[2], const __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  asm volatile("ld.global.cg.v2.f16 {%0, %1}, [%2];" : "=h"(x[0]), "=h"(x[1]) : "l"(addr));
+#else
+  x[0] = addr[0]; x[1] = addr[1];
+#endif
+}
+DI void ldg(__half (&x)[4], const __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  asm volatile("ld.global.cg.v4.f16 {%0, %1, %2, %3}, [%4];"
+               : "=h"(x[0]), "=h"(x[1]), "=h"(x[2]), "=h"(x[3])
+               : "l"(addr));
+#else
+  x[0] = addr[0]; x[1] = addr[1]; x[2] = addr[2]; x[3] = addr[3];
+#endif
+}
+DI void ldg(__half (&x)[8], const __half* addr)
+{
+#ifdef USE_HALF_LS_ASM
+  asm volatile("ld.global.cg.v8.f16 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];"
+               : "=h"(x[0]), "=h"(x[1]), "=h"(x[2]), "=h"(x[3]), "=h"(x[4]), "=h"(x[5]), "=h"(x[6]), "=h"(x[7])
+               : "l"(addr));
+#else
+  x[0] = addr[0]; x[1] = addr[1]; x[2] = addr[2]; x[3] = addr[3];
+  x[4] = addr[4]; x[5] = addr[5]; x[6] = addr[6]; x[7] = addr[7];
+#endif
+}
+
 DI void ldg(float& x, const float* addr)
 {
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr));
diff --git a/cpp/include/raft_distance/pairwise_distance.hpp b/cpp/include/raft_distance/pairwise_distance.hpp
index e91ef5de..0373d3ef 100644
--- a/cpp/include/raft_distance/pairwise_distance.hpp
+++ b/cpp/include/raft_distance/pairwise_distance.hpp
@@ -17,6 +17,18 @@
 #include <raft/distance/distance_types.hpp>
 
 namespace raft::distance::runtime {
+	
+void pairwise_distance(raft::handle_t const& handle,
+                       unsigned short* x,
+                       unsigned short* y,
+                       unsigned short* dists,
+                       int m,
+                       int n,
+                       int k,
+                       raft::distance::DistanceType metric,
+                       bool isRowMajor,
+                       float metric_arg);
+				   
 void pairwise_distance(raft::handle_t const& handle,
                        float* x,
                        float* y,
diff --git a/cpp/src/distance/pairwise_distance.cu b/cpp/src/distance/pairwise_distance.cu
index 71133c5f..583cbad8 100644
--- a/cpp/src/distance/pairwise_distance.cu
+++ b/cpp/src/distance/pairwise_distance.cu
@@ -21,6 +21,36 @@
 
 namespace raft::distance::runtime {
 
+void pairwise_distance(raft::handle_t const& handle,
+                       unsigned short* x,
+                       unsigned short* y,
+                       unsigned short* dists,
+                       int m,
+                       int n,
+                       int k,
+                       raft::distance::DistanceType metric,
+                       bool isRowMajor,
+                       float metric_arg)
+{
+  raft::distance::pairwise_distance<half, int>(
+    handle, (half*)x, (half*)y, (half*)dists, m, n, k, metric, isRowMajor, metric_arg);
+}
+
+void pairwise_distance(raft::handle_t const& handle,
+                       half* x,
+                       half* y,
+                       half* dists,
+                       int m,
+                       int n,
+                       int k,
+                       raft::distance::DistanceType metric,
+                       bool isRowMajor,
+                       float metric_arg)
+{
+  raft::distance::pairwise_distance<half, int>(
+    handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg);
+}
+
 void pairwise_distance(raft::handle_t const& handle,
                        float* x,
                        float* y,
diff --git a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
index 8d55402e..f3be29c8 100644
--- a/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
+++ b/python/pylibraft/pylibraft/distance/pairwise_distance.pyx
@@ -37,6 +37,17 @@ def is_c_cont(cai, dt):
 cdef extern from "raft_distance/pairwise_distance.hpp" \
         namespace "raft::distance::runtime":
 
+    cdef void pairwise_distance(const handle_t &handle,
+                                unsigned short *x,
+                                unsigned short *y,
+                                unsigned short *dists,
+                                int m,
+                                int n,
+                                int k,
+                                DistanceType metric,
+                                bool isRowMajor,
+                                float metric_arg)
+						  
     cdef void pairwise_distance(const handle_t &handle,
                                 float *x,
                                 float *y,
@@ -188,5 +199,16 @@ def distance(X, Y, dists, metric="euclidean", p=2.0):
                           <DistanceType>distance_type,
                           <bool>x_c_contiguous,
                           <float>p)
+    elif x_dt == np.float16:
+        pairwise_distance(deref(h),
+                          <unsigned short*> x_ptr,
+                          <unsigned short*> y_ptr,
+                          <unsigned short*> d_ptr,
+                          <int>m,
+                          <int>n,
+                          <int>x_k,
+                          <DistanceType>distance_type,
+                          <bool>x_c_contiguous,
+                          <float>p)
     else:
         raise ValueError("dtype %s not supported" % x_dt)