Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a variable sized SoA macro definition. #4

Open
wants to merge 5 commits into
base: clue_porting_to_cmssw
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
321 changes: 321 additions & 0 deletions CUDADataFormats/Common/interface/SoAmacros.h

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion CUDADataFormats/Common/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,11 @@
<use name="catch2"/>
<use name="cuda"/>
</bin>


<bin name="testCUDADataFormatsCommon"
file="testRunner.cc, testSoA.cc testSoAGPU.cu">
<use name="HeterogeneousCore/CUDACore"/>
<use name="cuda"/>
<use name="cppunit"/>
</bin>
</iftool>
1 change: 1 addition & 0 deletions CUDADataFormats/Common/test/testRunner.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#include "Utilities/Testing/interface/CppUnit_testdriver.icpp"
153 changes: 153 additions & 0 deletions CUDADataFormats/Common/test/testSoA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#include <iostream>
#include <stdint.h>
#include <stdlib.h>
#include <stdexcept>
#include <stdlib.h>
#include "cppunit/extensions/HelperMacros.h"

/*
* Unit tests skeleton based on DataFormats/Common/test/ (testDataFormatsCommon)
*/

#include "CUDADataFormats/Common/interface/SoAmacros.h"

class testSoA: public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(testSoA);
CPPUNIT_TEST(initialTest);
CPPUNIT_TEST(checkAlignment);
CPPUNIT_TEST_SUITE_END();

public:
void setUp() {}
void tearDown() {}
void initialTest();
void checkAlignment();
};


CPPUNIT_TEST_SUITE_REGISTRATION(testSoA);


#define check(X) \
do { std::cout << #X " is " << (X) << std::endl; } while(false)

// declare a statically-sized SoA, templated on the column size and (optional) alignment

declare_SoA_template(SoA,
// predefined static scalars
// size_t size;
// size_t alignment;

// columns: one value per element
SoA_column(double, x),
SoA_column(double, y),
SoA_column(double, z),
SoA_column(uint16_t, colour),
SoA_column(int32_t, value),
SoA_column(const char *, name),

// scalars: one value for the whole structure
SoA_scalar(const char *, description)
);

void testSoA::initialTest() {
std::cout << std::boolalpha;



SoA::dump(10, 8);
// CPPUNIT_ASSERT_EQUAL_MESSAGE("Size of SoA<1>",
// 3 * sizeof(double)
// + (sizeof(uint16_t) / sizeof(int32_t) + 1) * sizeof(int32_t) // Take into account the padding to align the next element
// + sizeof(int32_t)
// + 2 * sizeof(const char *),
// sizeof(SoA<1>));
//check(sizeof(SoA<1>));
std::cout << std::endl;

SoA::dump(10);
SoA::dump(31);
SoA::dump(32);
std::cout << std::endl;

SoA::dump(1, 64);
SoA::dump(10, 64);
SoA::dump(31, 64);
SoA::dump(32, 64);
std::cout << std::endl;

SoA soa(new std::byte[SoA::computeDataSize(10,32)], 10, 32);
check(& soa.z()[7] == & (soa[7].z));

soa[7].x = 0.;
soa[7].y = 3.1416;
soa[7].z = -1.;
soa[7].colour = 42;
soa[7].value = 9999;
soa[7].name = "element";

soa[9] = soa[7];

CPPUNIT_ASSERT(& soa.z()[7] == & (soa[7].z));
}

template <typename T>
void checkValuesAlignment(SoA &soa, T SoA::element::*member, const std::string & memberName, uint32_t bitAlignment) {
if (bitAlignment % 8) CPPUNIT_FAIL("bitAlignment not byte aligned.");
size_t byteAlignment = bitAlignment / 8;
for (size_t i=0; i<soa.nElements(); i++) {
// Check that each value is aligned
if (reinterpret_cast<std::uintptr_t>(&(soa[i].*member)) % byteAlignment
!= (i * T::valueSize) %byteAlignment ) {
std::stringstream err;
err << "Misaligned value: " << memberName << " at index=" << i
<< " address=" << &(soa[i].*member) << " byteAlignment=" << byteAlignment
<< " address lower part: " << reinterpret_cast<std::uintptr_t>(&(soa[i].*member)) % byteAlignment
<< " expected address lower part: " << ((i * T::valueSize) % byteAlignment)
<< " size=" << soa.nElements() << " align=" << soa.byteAlignment();
CPPUNIT_FAIL(err.str());
}
// Check that all values except the first-in rows (address 0 modulo alignment)
// are contiguous to their predecessors in memory (this will detect cutting
// memory/cache/etc... lines in unexpected places (for blocked SoA like AoSoA)
if ((reinterpret_cast<std::uintptr_t>(&(soa[i].*member)) % byteAlignment)
&& (reinterpret_cast<std::uintptr_t>(&(soa[i - 1].*member)) + T::valueSize
!= reinterpret_cast<std::uintptr_t>(&(soa[i].*member)))) {
std::stringstream err;
err << "Unexpected non-contiguity: " << memberName << " at index=" << i
<< " address=" << &(soa[i].*member) << " is not contiguous to "
<< memberName << " at index=" << i - 1 << "address=" << &(soa[i - 1].*member)
<< " size=" << soa.nElements() << " align=" << soa.byteAlignment() << " valueSize=" << T::valueSize;
CPPUNIT_FAIL(err.str());
}
}
}

void checkSoAAlignment(size_t nElements, size_t byteAlignment, uint32_t bitAlignment) {
// Allocate a block which is aligned (we need to pass free as the custom free function (to replace delete).
auto malloc_deleter = [](void * p){free(p);};
std::unique_ptr<std::byte, decltype(malloc_deleter)> dataBlock(
reinterpret_cast<std::byte*>(aligned_alloc(byteAlignment, SoA::computeDataSize(nElements,byteAlignment))),
malloc_deleter
);
SoA soa(dataBlock.get(), nElements, byteAlignment);
checkValuesAlignment(soa, &SoA::element::x, "x", bitAlignment);
checkValuesAlignment(soa, &SoA::element::y, "y", bitAlignment);
checkValuesAlignment(soa, &SoA::element::z, "z", bitAlignment);
checkValuesAlignment(soa, &SoA::element::colour, "colour", bitAlignment);
checkValuesAlignment(soa, &SoA::element::value, "value", bitAlignment);
checkValuesAlignment(soa, &SoA::element::name, "name", bitAlignment);
}

void testSoA::checkAlignment() {
checkSoAAlignment(1, 1, 8);
checkSoAAlignment(10, 1, 8);
checkSoAAlignment(31, 1, 8);
checkSoAAlignment(32, 1, 8);

checkSoAAlignment(1,64, 8*64);
checkSoAAlignment(10,64, 8*64);
checkSoAAlignment(31,64, 8*64);
checkSoAAlignment(32,64, 8*64);
}

138 changes: 138 additions & 0 deletions CUDADataFormats/Common/test/testSoAGPU.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#include "cppunit/extensions/HelperMacros.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

/*
* Unit tests for SoA running on GPU
*/

#include "CUDADataFormats/Common/interface/SoAmacros.h"

#include <cuda_runtime_api.h>

class testSoAGPU: public CppUnit::TestFixture {
CPPUNIT_TEST_SUITE(testSoAGPU);
CPPUNIT_TEST(runInGPU);
CPPUNIT_TEST_SUITE_END();

public:
void setUp() {}
void tearDown() {}
void runInGPU();

// declare a statically-sized SoA, templated on the column size and (optional) alignment
declare_SoA_template(SoA,
// predefined static scalars
// size_t size;
// size_t alignment;

// columns: one value per element
SoA_column(double, x),
SoA_column(double, y),
SoA_column(double, z),
SoA_column(uint16_t, colour),
SoA_column(int32_t, value),
SoA_column(double *, py),

// scalars: one value for the whole structure
SoA_scalar(const char *, description)
);

// declare equivalent struct
struct AoSelement {
double x;
double y;
double z;
uint16_t colour;
int32_t value;
double * py;
};
private:
static constexpr int defaultDevice = 0;
static constexpr size_t elementsCount = 100;

// Check we find what we wanted to initialize.
// Pass should be initialized to true.
__host__ __device__ static void checkSoAelement(SoA soa, size_t i, bool * pass) {
if (i >= soa.nElements() || !*pass) return;
if (soa[i].x != 11.0 * i) { *pass = false; return; }
if (soa[i].y != 22.0 * i) { *pass = false; return; }
if (soa[i].z != 33.0 * i) { *pass = false; return; }
if (soa[i].colour != i) { *pass = false; return; }
if (soa[i].value != static_cast<int32_t>(0x10001 * i)) { *pass = false; return; }
}

};

CPPUNIT_TEST_SUITE_REGISTRATION(testSoAGPU);

#define CUDA_UNIT_CHECK(A) CPPUNIT_ASSERT_NO_THROW(cudaCheck(A))

namespace {
// Fill up the elements of the SoA
__global__ void fillSoA(testSoAGPU::SoA soa) {
size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= soa.nElements()) return;
soa[i].x = 11.0 * i;
soa[i].y = 22.0 * i;
soa[i].z = 33.0 * i;
soa[i].colour = i;
soa[i].value = 0x10001 * i;
soa[i].py = &soa[i].y;
}

void hexdump(void *ptr, int buflen) {
/* From https://stackoverflow.com/a/29865 with adaptations */
unsigned char *buf = (unsigned char*)ptr;
int i, j;
for (i=0; i<buflen; i+=16) {
printf("%06x: ", i);
for (j=0; j<16; j++) {
if (i+j < buflen)
printf("%02x ", buf[i+j]);
else
printf(" ");
if ((i+j) % 4 == 3) printf (" ");
}
printf(" ");
// for (j=0; j<16; j++)
// if (i+j < buflen)
// printf("%c", isprint(buf[i+j]) ? buf[i+j] : '.');
printf("\n");
}
}
}

void testSoAGPU::runInGPU() {
// Get device, stream, memory
cudaDeviceProp deviceProperties;
CPPUNIT_ASSERT(cms::cudatest::testDevices());
CUDA_UNIT_CHECK(cudaGetDeviceProperties(&deviceProperties, defaultDevice));
cudaStream_t stream;
CUDA_UNIT_CHECK(cudaStreamCreate(&stream));

// Allocate memory and populate SoA descriptors
auto deviceSoABlock = cms::cuda::make_device_unique<std::byte[]>(SoA::computeDataSize(elementsCount), stream);
auto hostSoABlock = cms::cuda::make_host_unique<std::byte[]>(SoA::computeDataSize(elementsCount), stream);
SoA deviceSoA(deviceSoABlock.get(), elementsCount);
SoA hostSoA(hostSoABlock.get(), elementsCount);

// Call kernel, get result
fillSoA<<<(elementsCount - 1)/deviceProperties.warpSize + 1, deviceProperties.warpSize, 0, stream>>>(deviceSoA);
CUDA_UNIT_CHECK(cudaMemcpyAsync(hostSoABlock.get(), deviceSoABlock.get(), SoA::computeDataSize(hostSoA.nElements()), cudaMemcpyDeviceToHost, stream));
CUDA_UNIT_CHECK(cudaStreamSynchronize(stream));

// Validate result
bool pass = true;
size_t i = 0;
for (; pass && i< hostSoA.nElements(); i++) checkSoAelement(hostSoA, i, &pass);
if (!pass) {
std::cout << "In " << typeid(*this).name() << " check failed at i= " << i << ")" << std::endl;
hexdump(hostSoABlock.get(), SoA::computeDataSize(hostSoA.nElements()));
printf("base=%p, &y=%p\n", deviceSoABlock.get(), deviceSoA.y());
}
CPPUNIT_ASSERT(pass);
}
16 changes: 0 additions & 16 deletions CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h

This file was deleted.

35 changes: 6 additions & 29 deletions CUDADataFormats/HGCal/interface/HGCCLUECPUProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

#include "CUDADataFormats/HGCal/interface/HGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/ConstHGCCLUESoA.h"
#include "CUDADataFormats/HGCal/interface/HGCUncalibRecHitSoA.h"

class HGCCLUECPUProduct {
public:
HGCCLUECPUProduct() = default;
explicit HGCCLUECPUProduct(uint32_t nhits, const cudaStream_t &stream) : nhits_(nhits) {
size_tot_ = std::accumulate(sizes_.begin(), sizes_.end(), 0);
pad_ = ((nhits - 1) / 32 + 1) * 32; //align to warp boundary (assumption: warpSize = 32)
mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(pad_ * size_tot_, stream);
mMemCLUEHost = cms::cuda::make_host_unique<std::byte[]>(
HGCCLUESoADescriptor::computeDataSize(nhits), stream);
}
~HGCCLUECPUProduct() = default;

Expand All @@ -25,44 +23,23 @@ class HGCCLUECPUProduct {
HGCCLUECPUProduct &operator=(HGCCLUECPUProduct &&) = default;

HGCCLUESoA get() {
HGCCLUESoA soa;
soa.rho = reinterpret_cast<float *>(mMemCLUEHost.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t *>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool *>(soa.clusterIndex + pad_);
soa.nbytes = size_tot_;
HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
soa.nhits = nhits_;
soa.pad = pad_;
return soa;
}

ConstHGCCLUESoA get() const {
ConstHGCCLUESoA soa;
soa.rho = reinterpret_cast<float const*>(mMemCLUEHost.get());
soa.delta = soa.rho + pad_;
soa.nearestHigher = reinterpret_cast<int32_t const*>(soa.delta + pad_);
soa.clusterIndex = soa.nearestHigher + pad_;
soa.isSeed = reinterpret_cast<bool const*>(soa.clusterIndex + pad_);
const HGCCLUESoA get() const {
HGCCLUESoA soa(mMemCLUEHost.get(), nhits_);
soa.nhits = nhits_;
return soa;
}

//number of hits stored in the SoA
uint32_t nHits() const { return nhits_; }
//pad of memory block (used for warp alignment, slighlty larger than 'nhits_')
uint32_t pad() const { return pad_; }
//number of bytes of the SoA
uint32_t nBytes() const { return size_tot_; }

private:
cms::cuda::host::unique_ptr<std::byte[]> mMemCLUEHost;
static constexpr std::array<uint32_t, memory::npointers::ntypes_hgcclue_soa> sizes_ = {
{memory::npointers::float_hgcclue_soa * sizeof(float),
memory::npointers::int32_hgcclue_soa * sizeof(uint32_t),
memory::npointers::bool_hgcclue_soa * sizeof(bool)}};
uint32_t pad_;
uint32_t nhits_;
uint32_t size_tot_;
};

#endif //CUDADAtaFormats_HGCal_HGCCLUECPUProduct_H
Loading