Better random seeding, random filling, bug fixes and new documentation
Pencilcaseman committed Jul 16, 2023
1 parent fee63c9 commit 7d740bc
Showing 16 changed files with 621 additions and 26 deletions.
1 change: 1 addition & 0 deletions librapid/include/librapid/array/array.hpp
@@ -23,6 +23,7 @@
#include "arrayView.hpp"
#include "arrayViewString.hpp"
#include "arrayFromData.hpp"
#include "fill.hpp"
#include "pseudoConstructors.hpp"
#include "fourierTransform.hpp"

171 changes: 171 additions & 0 deletions librapid/include/librapid/array/fill.hpp
@@ -0,0 +1,171 @@
#ifndef LIBRAPID_ARRAY_FILL_HPP
#define LIBRAPID_ARRAY_FILL_HPP

namespace librapid {
template<typename ShapeType, typename StorageType, typename Scalar>
LIBRAPID_ALWAYS_INLINE void fill(array::ArrayContainer<ShapeType, StorageType> &dst,
const Scalar &value) {
dst = array::ArrayContainer<ShapeType, StorageType>(dst.shape(), value);
}

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, Storage<StorageScalar>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
auto *data = dst.storage().begin();
bool parallel = global::numThreads != 1 && shape.size() > global::multithreadThreshold;

if (parallel) {
#pragma omp parallel for
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = random<StorageScalar>(lower, upper);
}
} else {
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = random<StorageScalar>(lower, upper);
}
}
}

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandomGaussian(array::ArrayContainer<ShapeType, Storage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
auto *data = dst.storage().begin();
bool parallel = global::numThreads != 1 && shape.size() > global::multithreadThreshold;

if (parallel) {
#pragma omp parallel for
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = randomGaussian<StorageScalar>();
}
} else {
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = randomGaussian<StorageScalar>();
}
}
}

#if defined(LIBRAPID_HAS_OPENCL)

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, OpenCLStorage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Initialize a buffer of random seeds
static int64_t numSeeds = 1024;
static bool initialized = false;
static Array<int64_t, backend::OpenCL> seeds(Shape {numSeeds});
if (global::reseed || !initialized) {
for (int64_t i = 0; i < numSeeds; ++i) { seeds(i) = randint(0, INT64_MAX); }
initialized = true;

// reseed is controlled by the random module, so we don't need to worry about it here
}

// Run the kernel
opencl::runLinearKernel<StorageScalar, true>("fillRandom",
elements,
dst.storage().data(),
elements,
static_cast<StorageScalar>(lower),
static_cast<StorageScalar>(upper),
seeds.storage().data(),
numSeeds);
}

#endif // LIBRAPID_HAS_OPENCL

#if defined(LIBRAPID_HAS_CUDA)

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Initialize a buffer of random seeds
static int64_t numSeeds = 1024;
static bool initialized = false;
static Array<int64_t, backend::CUDA> seeds(Shape {numSeeds});

if (global::reseed || !initialized) {
for (int64_t i = 0; i < numSeeds; ++i) { seeds(i) = randint(0, INT64_MAX); }
initialized = true;

// reseed is controlled by the random module, so we don't need to worry about it here
}

cuda::runKernel<StorageScalar, Lower, Upper>("fill",
"fillRandom",
elements,
dst.storage().data().get(),
elements,
static_cast<StorageScalar>(lower),
static_cast<StorageScalar>(upper),
seeds.storage().data().get(),
numSeeds);
}

template<typename ShapeType, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<float>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Create a pseudo-random number generator
static curandGenerator_t prng;
static bool initialized = false;

if (!initialized) {
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed);
initialized = true;
}

if (global::reseed) { curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed); }

// Run the kernel
curandGenerateUniform(prng, dst.storage().data().get(), elements);

// Scale the result to the desired range
dst = dst * (upper - lower) + lower;
}

template<typename ShapeType, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<double>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Create a pseudo-random number generator
static curandGenerator_t prng;
static bool initialized = false;

if (!initialized) {
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed);
initialized = true;
}

if (global::reseed) { curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed); }

// Run the kernel
curandGenerateUniformDouble(prng, dst.storage().data().get(), elements);

// Scale the result to the desired range
dst = dst * (upper - lower) + lower;
}

#endif // LIBRAPID_HAS_CUDA
} // namespace librapid

#endif // !LIBRAPID_ARRAY_FILL_HPP
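
The new fill routines are free functions in the librapid namespace, so a minimal CPU-side usage sketch might look like the following. The top-level include path, the Array and Shape constructors, and the default storage backend are assumptions for illustration rather than taken from this diff:

#include <librapid/librapid.hpp>

int main() {
    namespace lrc = librapid;

    lrc::Array<float> data(lrc::Shape {3, 4}); // 3x4 single-precision array, CPU storage

    lrc::fill(data, 3.14f);                    // every element becomes 3.14
    lrc::fillRandom(data, -1.0f, 1.0f);        // uniform values between -1 and 1
    lrc::fillRandomGaussian(data, 0.0f, 1.0f); // normally distributed values (the bounds are currently unused)

    return 0;
}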
5 changes: 2 additions & 3 deletions librapid/include/librapid/array/function.hpp
@@ -171,8 +171,8 @@ namespace librapid {
/// \return The result of the function (scalar).
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Scalar scalar(size_t index) const;

Iterator begin() const;
Iterator end() const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Iterator begin() const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Iterator end() const;

/// Return a string representation of the Function
/// \param format The format to use.
@@ -208,7 +208,6 @@ namespace librapid {

template<typename desc, typename Functor, typename... Args>
auto Function<desc, Functor, Args...>::shape() const {
// return std::get<0>(m_args).shape();
return typetraits::TypeInfo<Functor>::getShape(m_args);
}

3 changes: 3 additions & 0 deletions librapid/include/librapid/array/linalg/arrayMultiply.hpp
@@ -336,6 +336,9 @@ namespace librapid {
return {1};
}
}

LIBRAPID_NOT_IMPLEMENTED;
return {1};
}

template<typename ShapeTypeA, typename StorageTypeA, typename ShapeTypeB,
14 changes: 10 additions & 4 deletions librapid/include/librapid/array/linalg/level3/gemm.hpp
@@ -4,8 +4,10 @@
namespace librapid::linalg {
/// \brief General matrix-matrix multiplication
///
/// Computes \f$ \mathbf{C} = \alpha \mathrm{OP}_A(\mathbf{A}) \mathrm{OP}_B(\mathbf{B}) + \beta \mathbf{C} \f$
/// for matrices \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ and \f$ \mathbf{C} \f$. \f$ \mathrm{OP}_A \f$ and \f$ \mathrm{OP}_B \f$ are
/// Computes \f$ \mathbf{C} = \alpha \mathrm{OP}_A(\mathbf{A}) \mathrm{OP}_B(\mathbf{B}) +
/// \beta \mathbf{C} \f$
/// for matrices \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ and \f$ \mathbf{C} \f$.
/// \f$ \mathrm{OP}_A \f$ and \f$ \mathrm{OP}_B \f$ are
/// either the identity or the transpose operation.
/// \tparam Int Integer type for matrix dimensions
/// \tparam Alpha Type of \f$ \alpha \f$
@@ -268,8 +270,12 @@ namespace librapid::linalg {
} else {
// If the provided types are not supported by cuBLAS, use the custom fallback kernel

jitify::Program program = global::jitCache.program(cuda::loadKernel(
fmt::format("{}/include/librapid/array/linalg/level3/gemm", LIBRAPID_SOURCE), false));
jitify::Program program = global::jitCache.program(
cuda::loadKernel(
fmt::format("{}/include/librapid/array/linalg/level3/gemm", LIBRAPID_SOURCE),
false),
{},
{fmt::format("-I{}", CUDA_INCLUDE_DIRS)});

size_t TS = 32;

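To make the documented formula concrete, this is what the fallback path has to compute in the simplest case, with both OP_A and OP_B set to the identity. The function below is an illustrative row-major reference, not LibRapid's kernel:

// Reference semantics only: C (m x n) = alpha * A (m x k) * B (k x n) + beta * C,
// row-major storage, no transposition, no blocking or vectorisation.
template<typename T>
void gemmReference(int64_t m, int64_t n, int64_t k, T alpha, const T *a, const T *b, T beta, T *c) {
    for (int64_t i = 0; i < m; ++i) {
        for (int64_t j = 0; j < n; ++j) {
            T acc = T(0);
            for (int64_t p = 0; p < k; ++p) { acc += a[i * k + p] * b[p * n + j]; }
            c[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
    }
}
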
10 changes: 5 additions & 5 deletions librapid/include/librapid/array/linalg/transpose.hpp
@@ -325,7 +325,7 @@ namespace librapid {
}
} // namespace opencl

#endif // LIBRAPID_HAS_OPENCL
#endif // LIBRAPID_HAS_OPENCL

#if defined(LIBRAPID_HAS_CUDA)
namespace cuda {
@@ -344,14 +344,14 @@ namespace librapid {
cublasSafeCall(cublasSgeam(global::cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
rows,
cols,
(int)rows,
(int)cols,
&alpha,
in,
cols,
(int)cols,
&zero,
in,
cols,
(int)cols,
out,
rows));
}
9 changes: 9 additions & 0 deletions librapid/include/librapid/core/global.hpp
@@ -23,6 +23,12 @@ namespace librapid {
// Number of threads used by LibRapid
extern size_t numThreads;

// Random seed used by LibRapid (when changed, the random number generator is reseeded)
extern size_t randomSeed;

// Should the random number generator be reseeded?
extern bool reseed;

// Size of a cache line in bytes
extern size_t cacheLineSize;

@@ -74,6 +80,9 @@

void setNumThreads(size_t numThreads);
size_t getNumThreads();

void setSeed(size_t seed);
size_t getSeed();
} // namespace librapid

#endif // LIBRAPID_CORE_GLOBAL_HPP
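
Taken together with fill.hpp above, the intended flow appears to be that setSeed stores the new seed and raises the reseed flag, and the next random fill re-initialises its seed buffer (or the cuRAND generator). A hedged sketch of that round trip, with the include and Array construction assumed:

#include <librapid/librapid.hpp>

int main() {
    namespace lrc = librapid;

    lrc::setSeed(42); // store the seed; the reseed flag tells the next fill to refresh its state

    lrc::Array<float> a(lrc::Shape {1000});
    lrc::fillRandom(a, 0.0f, 1.0f); // generator state is refreshed on this call

    // Note: on the CPU path, the OpenMP loop does not fix which thread produces
    // which element, so bitwise reproducibility may require a single thread.
    return 0;
}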
12 changes: 12 additions & 0 deletions librapid/include/librapid/cuda/kernels/activations.cu
@@ -1,5 +1,17 @@
template<typename Destination, typename Data>
__global__ void sigmoidActivationForward(size_t elements, Destination *dst, Data *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp((float)-src[kernelIndex])); }
}

template<>
__global__ void sigmoidActivationForward(size_t elements, float *dst, float *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp(-src[kernelIndex])); }
}

template<>
__global__ void sigmoidActivationForward(size_t elements, double *dst, double *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp(-src[kernelIndex])); }
}
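
These kernels appear to be compiled at runtime through the jitify pipeline (as in the gemm change above) rather than launched directly from host code, but for illustration a direct launch of the float specialisation would look roughly like this; the wrapper name and block size are arbitrary assumptions:

// Hedged sketch: launch sigmoidActivationForward on raw device buffers.
void launchSigmoidForward(size_t elements, float *dst, float *src, cudaStream_t stream) {
    const size_t threadsPerBlock = 256; // assumed block size
    const size_t blocks          = (elements + threadsPerBlock - 1) / threadsPerBlock;
    sigmoidActivationForward<<<blocks, threadsPerBlock, 0, stream>>>(elements, dst, src);
}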