Better random seeding, random filling, bug fixes and new documentation
Pencilcaseman committed Jul 16, 2023
1 parent fee63c9 commit 7d740bc
Showing 16 changed files with 621 additions and 26 deletions.
1 change: 1 addition & 0 deletions librapid/include/librapid/array/array.hpp
@@ -23,6 +23,7 @@
#include "arrayView.hpp"
#include "arrayViewString.hpp"
#include "arrayFromData.hpp"
#include "fill.hpp"
#include "pseudoConstructors.hpp"
#include "fourierTransform.hpp"

171 changes: 171 additions & 0 deletions librapid/include/librapid/array/fill.hpp
@@ -0,0 +1,171 @@
#ifndef LIBRAPID_ARRAY_FILL_HPP
#define LIBRAPID_ARRAY_FILL_HPP

namespace librapid {
template<typename ShapeType, typename StorageType, typename Scalar>
LIBRAPID_ALWAYS_INLINE void fill(array::ArrayContainer<ShapeType, StorageType> &dst,
const Scalar &value) {
dst = array::ArrayContainer<ShapeType, StorageType>(dst.shape(), value);
}

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, Storage<StorageScalar>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
auto *data = dst.storage().begin();
bool parallel = global::numThreads != 1 && shape.size() > global::multithreadThreshold;

if (parallel) {
#pragma omp parallel for
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = random<StorageScalar>(lower, upper);
}
} else {
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = random<StorageScalar>(lower, upper);
}
}
}

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandomGaussian(array::ArrayContainer<ShapeType, Storage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
auto *data = dst.storage().begin();
bool parallel = global::numThreads != 1 && shape.size() > global::multithreadThreshold;

if (parallel) {
#pragma omp parallel for
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = randomGaussian<StorageScalar>();
}
} else {
for (int64_t i = 0; i < shape.size(); ++i) {
data[i] = randomGaussian<StorageScalar>();
}
}
}

#if defined(LIBRAPID_HAS_OPENCL)

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, OpenCLStorage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Initialize a buffer of random seeds
static int64_t numSeeds = 1024;
static bool initialized = false;
static Array<int64_t, backend::OpenCL> seeds(Shape {numSeeds});
if (global::reseed || !initialized) {
for (int64_t i = 0; i < numSeeds; ++i) { seeds(i) = randint(0, INT64_MAX); }
initialized = true;

// reseed is controlled by the random module, so we don't need to worry about it here
}

// Run the kernel
opencl::runLinearKernel<StorageScalar, true>("fillRandom",
elements,
dst.storage().data(),
elements,
static_cast<StorageScalar>(lower),
static_cast<StorageScalar>(upper),
seeds.storage().data(),
numSeeds);
}

#endif // LIBRAPID_HAS_OPENCL

#if defined(LIBRAPID_HAS_CUDA)

template<typename ShapeType, typename StorageScalar, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<StorageScalar>> &dst,
const Lower &lower, const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Initialize a buffer of random seeds
static int64_t numSeeds = 1024;
static bool initialized = false;
static Array<int64_t, backend::CUDA> seeds(Shape {numSeeds});

if (global::reseed || !initialized) {
for (int64_t i = 0; i < numSeeds; ++i) { seeds(i) = randint(0, INT64_MAX); }
initialized = true;

// reseed is controlled by the random module, so we don't need to worry about it here
}

cuda::runKernel<StorageScalar, Lower, Upper>("fill",
"fillRandom",
elements,
dst.storage().data().get(),
elements,
static_cast<StorageScalar>(lower),
static_cast<StorageScalar>(upper),
seeds.storage().data().get(),
numSeeds);
}

template<typename ShapeType, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<float>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Create a pseudo-random number generator
static curandGenerator_t prng;
static bool initialized = false;

if (!initialized) {
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed);
initialized = true;
}

if (global::reseed) { curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed); }

// Run the kernel
curandGenerateUniform(prng, dst.storage().data().get(), elements);

// Scale the result to the desired range
dst = dst * (upper - lower) + lower;
}

template<typename ShapeType, typename Lower, typename Upper>
LIBRAPID_ALWAYS_INLINE void
fillRandom(array::ArrayContainer<ShapeType, CudaStorage<double>> &dst, const Lower &lower,
const Upper &upper) {
ShapeType shape = dst.shape();
int64_t elements = shape.size();

// Create a pseudo-random number generator
static curandGenerator_t prng;
static bool initialized = false;

if (!initialized) {
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed);
initialized = true;
}

if (global::reseed) { curandSetPseudoRandomGeneratorSeed(prng, global::randomSeed); }

// Run the kernel
curandGenerateUniformDouble(prng, dst.storage().data().get(), elements);

// Scale the result to the desired range
dst = dst * (upper - lower) + lower;
}

#endif // LIBRAPID_HAS_CUDA
} // namespace librapid

#endif // !LIBRAPID_ARRAY_FILL_HPP
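
The new fill routines are free functions in the librapid namespace, so a minimal CPU-side usage sketch might look like the following. The top-level include path, the Array and Shape constructors, and the default storage backend are assumptions for illustration rather than taken from this diff:

#include <librapid/librapid.hpp>

int main() {
    namespace lrc = librapid;

    lrc::Array<float> data(lrc::Shape {3, 4}); // 3x4 single-precision array, CPU storage

    lrc::fill(data, 3.14f);                    // every element becomes 3.14
    lrc::fillRandom(data, -1.0f, 1.0f);        // uniform values between -1 and 1
    lrc::fillRandomGaussian(data, 0.0f, 1.0f); // normally distributed values (the bounds are currently unused)

    return 0;
}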
5 changes: 2 additions & 3 deletions librapid/include/librapid/array/function.hpp
@@ -171,8 +171,8 @@ namespace librapid {
/// \return The result of the function (scalar).
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Scalar scalar(size_t index) const;

Iterator begin() const;
Iterator end() const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Iterator begin() const;
LIBRAPID_NODISCARD LIBRAPID_ALWAYS_INLINE Iterator end() const;

/// Return a string representation of the Function
/// \param format The format to use.
@@ -208,7 +208,6 @@ namespace librapid {

template<typename desc, typename Functor, typename... Args>
auto Function<desc, Functor, Args...>::shape() const {
// return std::get<0>(m_args).shape();
return typetraits::TypeInfo<Functor>::getShape(m_args);
}

3 changes: 3 additions & 0 deletions librapid/include/librapid/array/linalg/arrayMultiply.hpp
@@ -336,6 +336,9 @@ namespace librapid {
return {1};
}
}

LIBRAPID_NOT_IMPLEMENTED;
return {1};
}

template<typename ShapeTypeA, typename StorageTypeA, typename ShapeTypeB,
14 changes: 10 additions & 4 deletions librapid/include/librapid/array/linalg/level3/gemm.hpp
@@ -4,8 +4,10 @@
namespace librapid::linalg {
/// \brief General matrix-matrix multiplication
///
/// Computes \f$ \mathbf{C} = \alpha \mathrm{OP}_A(\mathbf{A}) \mathrm{OP}_B(\mathbf{B}) + \beta \mathbf{C} \f$
/// for matrices \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ and \f$ \mathbf{C} \f$. \f$ \mathrm{OP}_A \f$ and \f$ \mathrm{OP}_B \f$ are
/// Computes \f$ \mathbf{C} = \alpha \mathrm{OP}_A(\mathbf{A}) \mathrm{OP}_B(\mathbf{B}) +
/// \beta \mathbf{C} \f$
/// for matrices \f$ \mathbf{A} \f$, \f$ \mathbf{B} \f$ and \f$ \mathbf{C} \f$.
/// \f$ \mathrm{OP}_A \f$ and \f$ \mathrm{OP}_B \f$ are
/// either the identity or the transpose operation.
/// \tparam Int Integer type for matrix dimensions
/// \tparam Alpha Type of \f$ \alpha \f$
@@ -268,8 +270,12 @@ namespace librapid::linalg {
} else {
// If the provided types are not supported by cuBLAS, use the custom fallback kernel

jitify::Program program = global::jitCache.program(cuda::loadKernel(
fmt::format("{}/include/librapid/array/linalg/level3/gemm", LIBRAPID_SOURCE), false));
jitify::Program program = global::jitCache.program(
cuda::loadKernel(
fmt::format("{}/include/librapid/array/linalg/level3/gemm", LIBRAPID_SOURCE),
false),
{},
{fmt::format("-I{}", CUDA_INCLUDE_DIRS)});

size_t TS = 32;

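To make the documented formula concrete, this is what the fallback path has to compute in the simplest case, with both OP_A and OP_B set to the identity. The function below is an illustrative row-major reference, not LibRapid's kernel:

// Reference semantics only: C (m x n) = alpha * A (m x k) * B (k x n) + beta * C,
// row-major storage, no transposition, no blocking or vectorisation.
template<typename T>
void gemmReference(int64_t m, int64_t n, int64_t k, T alpha, const T *a, const T *b, T beta, T *c) {
    for (int64_t i = 0; i < m; ++i) {
        for (int64_t j = 0; j < n; ++j) {
            T acc = T(0);
            for (int64_t p = 0; p < k; ++p) { acc += a[i * k + p] * b[p * n + j]; }
            c[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
    }
}
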
10 changes: 5 additions & 5 deletions librapid/include/librapid/array/linalg/transpose.hpp
@@ -325,7 +325,7 @@ namespace librapid {
}
} // namespace opencl

#endif // LIBRAPID_HAS_OPENCL
#endif // LIBRAPID_HAS_OPENCL

#if defined(LIBRAPID_HAS_CUDA)
namespace cuda {
@@ -344,14 +344,14 @@ namespace librapid {
cublasSafeCall(cublasSgeam(global::cublasHandle,
CUBLAS_OP_T,
CUBLAS_OP_N,
rows,
cols,
(int)rows,
(int)cols,
&alpha,
in,
cols,
(int)cols,
&zero,
in,
cols,
(int)cols,
out,
rows));
}
9 changes: 9 additions & 0 deletions librapid/include/librapid/core/global.hpp
@@ -23,6 +23,12 @@ namespace librapid {
// Number of threads used by LibRapid
extern size_t numThreads;

// Random seed used by LibRapid (when changed, the random number generator is reseeded)
extern size_t randomSeed;

// Should the random number generator be reseeded?
extern bool reseed;

// Size of a cache line in bytes
extern size_t cacheLineSize;

@@ -74,6 +80,9 @@

void setNumThreads(size_t numThreads);
size_t getNumThreads();

void setSeed(size_t seed);
size_t getSeed();
} // namespace librapid

#endif // LIBRAPID_CORE_GLOBAL_HPP
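
Taken together with fill.hpp above, the intended flow appears to be that setSeed stores the new seed and raises the reseed flag, and the next random fill re-initialises its seed buffer (or the cuRAND generator). A hedged sketch of that round trip, with the include and Array construction assumed:

#include <librapid/librapid.hpp>

int main() {
    namespace lrc = librapid;

    lrc::setSeed(42); // store the seed; the reseed flag tells the next fill to refresh its state

    lrc::Array<float> a(lrc::Shape {1000});
    lrc::fillRandom(a, 0.0f, 1.0f); // generator state is refreshed on this call

    // Note: on the CPU path, the OpenMP loop does not fix which thread produces
    // which element, so bitwise reproducibility may require a single thread.
    return 0;
}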
12 changes: 12 additions & 0 deletions librapid/include/librapid/cuda/kernels/activations.cu
@@ -1,5 +1,17 @@
template<typename Destination, typename Data>
__global__ void sigmoidActivationForward(size_t elements, Destination *dst, Data *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp((float)-src[kernelIndex])); }
}

template<>
__global__ void sigmoidActivationForward(size_t elements, float *dst, float *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp(-src[kernelIndex])); }
}

template<>
__global__ void sigmoidActivationForward(size_t elements, double *dst, double *src) {
const size_t kernelIndex = blockDim.x * blockIdx.x + threadIdx.x;
if (kernelIndex < elements) { dst[kernelIndex] = 1 / (1 + exp(-src[kernelIndex])); }
}
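
These kernels appear to be compiled at runtime through the jitify pipeline (as in the gemm change above) rather than launched directly from host code, but for illustration a direct launch of the float specialisation would look roughly like this; the wrapper name and block size are arbitrary assumptions:

// Hedged sketch: launch sigmoidActivationForward on raw device buffers.
void launchSigmoidForward(size_t elements, float *dst, float *src, cudaStream_t stream) {
    const size_t threadsPerBlock = 256; // assumed block size
    const size_t blocks          = (elements + threadsPerBlock - 1) / threadsPerBlock;
    sigmoidActivationForward<<<blocks, threadsPerBlock, 0, stream>>>(elements, dst, src);
}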