From 0366d4ba0628c863a67c38541e0f59b9841b9933 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Mon, 12 Sep 2022 17:24:42 -0400 Subject: [PATCH 01/17] Update README.md --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..ad1c0bd 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,13 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Constance Wang + * [LinkedIn](https://www.linkedin.com/in/conswang/) + +Tested on AORUS 15P XD laptop with specs: +- Windows 11 22000.856 +- 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz 2.30 GHz +- NVIDIA GeForce RTX 3070 Laptop GPU ### (TODO: Your README) From a8cfb893a2f309c2362075abf3a9811270d15363 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Fri, 16 Sep 2022 12:55:42 -0400 Subject: [PATCH 02/17] add cpu implementations --- stream_compaction/cpu.cu | 55 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..cf65471 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -12,6 +12,17 @@ namespace StreamCompaction { return timer; } + void exclusiveScan(int n, int* odata, const int* idata) { + if (n <= 0) { + printf("Empty array!"); + return; + } + odata[0] = 0; // identity + for (int i = 0; i < n - 1; ++i) { + odata[i + 1] = odata[i] + idata[i]; + } + } + /** * CPU scan (prefix sum). * For performance analysis, this is supposed to be a simple for loop. @@ -20,6 +31,7 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + exclusiveScan(n, odata, idata); timer().endCpuTimer(); } @@ -31,8 +43,15 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int outputIndex = 0; + for (int inputIndex = 0; inputIndex < n; ++inputIndex) { + if (idata[inputIndex] != 0) { + odata[outputIndex] = idata[inputIndex]; + outputIndex += 1; + } + } timer().endCpuTimer(); - return -1; + return outputIndex; } /** @@ -43,8 +62,40 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + // calculate bool. array + int* boolData = new int[n]; + if (boolData == NULL) { + printf("Failed to allocate boolData"); + return -1; + } + for (int i = 0; i < n; ++i) { + boolData[i] = (idata[i] == 0) ? 
0 : 1; + } + + int* boolScan = new int[n]; + if (boolScan == NULL) { + printf("Failed to allocate boolScan"); + return -1; + } + exclusiveScan(n, boolScan, boolData); + //printf("boolScan "); + //for (int i = 0; i < n; ++i) { + // printf("%d ", boolScan[i]); + //} + + int outIndex = -1; + // Scatter + for (int i = 0; i < n; ++i) { + if (boolData[i] == 1) { + outIndex = boolScan[i]; + odata[outIndex] = idata[i]; + } + } + delete[] boolData; + delete[] boolScan; timer().endCpuTimer(); - return -1; + + return outIndex + 1; } } } From dd07ee2470b88eefde0856d12e0b1c633b7f9be3 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Fri, 16 Sep 2022 15:53:36 -0400 Subject: [PATCH 03/17] working naive scan - inclusive, only for pow of 2 --- stream_compaction/naive.cu | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..57a006b 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,12 @@ #include "common.h" #include "naive.h" +#include +#include + +#define blockSize 128 +#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,7 +17,23 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + int* dev_idata; + int* dev_odata; + // TODO: __global__ + __global__ void kernScan(int n, int offset, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (index >= offset) { + odata[index] = idata[index - offset] + idata[index]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
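    // For intuition, a small illustrative trace (hypothetical 4-element input, not one of the
    // repo's test arrays): each kernScan pass adds in the element `offset` slots to the left,
    // and offset doubles every pass, so for idata = [1, 2, 3, 4]:
    //   offset 1: [1, 1+2, 2+3, 3+4] = [1, 3, 5, 7]
    //   offset 2: [1, 3,   1+5, 3+7] = [1, 3, 6, 10]
    // After ilog2ceil(n) passes each slot holds an inclusive prefix sum, matching this
    // commit's "inclusive, only for pow of 2" behavior.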
@@ -19,6 +41,38 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc failed"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorWithLine("memcpy idata failed!"); + + dim3 fullBlocksPerGrid((n - blockSize + 1) / blockSize); + + int logOfN = ilog2ceil(n); + + for (int d = 1; d <= logOfN; ++d) { + int offset = pow(2, d - 1); + kernScan << < fullBlocksPerGrid, blockSize >> > (n, offset, dev_odata, dev_idata); + + // Needed to make sure we don't launch next iteration while prev is running + cudaDeviceSynchronize(); + + std::swap(dev_odata, dev_idata); + } + + // Now result buffer is in dev_idata + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("memcpy back to odata failed"); + + //printf("mewo mewomoemomemeow"); + //for (int i = 0; i < n; ++i) { + // printf("%d ", odata[i]); + //} + + cudaFree(dev_idata); + cudaFree(dev_odata); timer().endGpuTimer(); } } From 84b3fac5e9593531950e176ddc356205a451be1c Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Fri, 16 Sep 2022 21:34:27 -0400 Subject: [PATCH 04/17] Working naive scan for non-powers of 2 --- src/main.cpp | 4 +-- stream_compaction/naive.cu | 61 ++++++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..6ec45c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,8 +13,8 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two +const int SIZE = 1 << 8; // feel free to change the size of array = 256 +const int NPOT = SIZE - 3; // Non-Power-Of-Two = 253 int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 57a006b..fae4e00 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -5,6 +5,7 @@ #include #include +#include #define blockSize 128 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__) @@ -19,6 +20,7 @@ namespace StreamCompaction { } int* dev_idata; int* dev_odata; + int* dev_unpadded_idata; // TODO: __global__ __global__ void kernScan(int n, int offset, int* odata, const int* idata) { @@ -35,26 +37,64 @@ namespace StreamCompaction { } } + // Pad the data with 1 zero at the beginning + // And enough zeroes at the end + // odata should be buffer of size paddedLength = the next power of two after and including (n + 1) + __global__ void kernShiftAndPadInput(int n, int paddedLength, int* odata, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index == 0) { + odata[index] = 0; + } + else if (index <= n) { + odata[index] = idata[index - 1]; + } + else if (index < paddedLength) { + odata[index] = 0; + } + else { + return; + } + } + + void printCudaArray(int n, int* dev_array) { + int* tempArray = (int*) malloc(n * sizeof(int)); + cudaMemcpy(tempArray, dev_array, n * sizeof(int), cudaMemcpyDeviceToHost); + printf("Print array -----------\n"); + for (int i = 0; i < n; ++i) { + printf("%d ", tempArray[i]); + } + free(tempArray); + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO - cudaMalloc((void**)&dev_idata, n * sizeof(int)); - cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + // Add 1 because we're going to offset by a zero for exclusive scan + int exponent = ilog2ceil(n + 1); + int paddedLength = pow(2, exponent); + // Input and output should be padded by 1s and 0s + cudaMalloc((void**)&dev_unpadded_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_odata, paddedLength * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc failed"); - cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorWithLine("memcpy idata failed!"); - dim3 fullBlocksPerGrid((n - blockSize + 1) / blockSize); + dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); + + kernShiftAndPadInput<<>> + (n, paddedLength, dev_idata, dev_unpadded_idata); - int logOfN = ilog2ceil(n); + //printCudaArray(paddedLength, dev_idata); - for (int d = 1; d <= logOfN; ++d) { + for (int d = 1; d <= exponent; ++d) { int offset = pow(2, d - 1); - kernScan << < fullBlocksPerGrid, blockSize >> > (n, offset, dev_odata, dev_idata); + kernScan << < fullBlocksPerGrid, blockSize >> > (paddedLength, offset, dev_odata, dev_idata); // Needed to make sure we don't launch next iteration while prev is running cudaDeviceSynchronize(); @@ -63,16 +103,13 @@ namespace StreamCompaction { } // Now result buffer is in dev_idata + // We only need the first n elements of the total paddedLength elements cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAErrorWithLine("memcpy back to odata failed"); - //printf("mewo mewomoemomemeow"); - //for (int i = 0; i < n; ++i) { - // printf("%d ", odata[i]); - //} - cudaFree(dev_idata); cudaFree(dev_odata); + cudaFree(dev_unpadded_idata); timer().endGpuTimer(); } } From d1b181e49b65dc9850777d0c3e46910cc1900ccb Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Fri, 16 Sep 2022 22:51:12 -0400 Subject: [PATCH 05/17] zero padding working without shift --- stream_compaction/common.cu | 1 - stream_compaction/common.h | 10 +++++++++ stream_compaction/efficient.cu | 38 ++++++++++++++++++++++++++++++++++ stream_compaction/naive.cu | 10 --------- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..58bb3b1 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -17,7 +17,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { namespace StreamCompaction { namespace Common { - /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..5405eeb 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -30,6 +30,16 @@ inline int ilog2ceil(int x) { return x == 1 ? 
0 : ilog2(x - 1) + 1; } +inline void printCudaArray(int n, int* dev_array) { + int* tempArray = (int*)malloc(n * sizeof(int)); + cudaMemcpy(tempArray, dev_array, n * sizeof(int), cudaMemcpyDeviceToHost); + printf("Print array -----------\n"); + for (int i = 0; i < n; ++i) { + printf("%d ", tempArray[i]); + } + free(tempArray); +} + namespace StreamCompaction { namespace Common { __global__ void kernMapToBoolean(int n, int *bools, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..07de68c 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,12 +14,48 @@ namespace StreamCompaction { return timer; } + // unlike naive impl, this one doesn't shift the array + __global__ void kernPadArray(int n, int paddedLen, int* odata, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < n) { + odata[index] = idata[index]; + } + else if (index < paddedLen) { + odata[index] = 0; + } + } + + int* dev_unpadded_idata; + int* dev_idata; + int* dev_odata; + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + // Pad array + int exponent = ilog2ceil(n); + int paddedLength = pow(2, exponent); + dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_unpadded_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_odata, paddedLength * sizeof(int)); + + cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("Cuda memcpy idata no work"); + + kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); + printCudaArray(paddedLength, dev_idata); + + // Build tree + + cudaFree(dev_unpadded_idata); + cudaFree(dev_idata); + cudaFree(dev_odata); + timer().endGpuTimer(); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index fae4e00..873aba8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -56,16 +56,6 @@ namespace StreamCompaction { } } - void printCudaArray(int n, int* dev_array) { - int* tempArray = (int*) malloc(n * sizeof(int)); - cudaMemcpy(tempArray, dev_array, n * sizeof(int), cudaMemcpyDeviceToHost); - printf("Print array -----------\n"); - for (int i = 0; i < n; ++i) { - printf("%d ", tempArray[i]); - } - free(tempArray); - } - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
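         * For intuition, a small worked example (hypothetical input [1, 2, 3, 4]) of the
         * upsweep / downsweep this work-efficient scan builds toward:
         *   upsweep (partial sums up the tree):  [1, 2, 3, 4] -> [1, 3, 3, 7] -> [1, 3, 3, 10]
         *   zero the root:                       [1, 3, 3, 0]
         *   downsweep (swap and accumulate):     [1, 3, 3, 0] -> [1, 0, 3, 3] -> [0, 1, 3, 6]
         * which is the exclusive scan of the input.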
*/ From 7f786cf3ff88ac5adfa3133a2a43652001a18db4 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sat, 17 Sep 2022 19:52:59 -0400 Subject: [PATCH 06/17] working upsweep --- stream_compaction/efficient.cu | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 07de68c..7ec0686 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -15,7 +15,7 @@ namespace StreamCompaction { } // unlike naive impl, this one doesn't shift the array - __global__ void kernPadArray(int n, int paddedLen, int* odata, int* idata) { + __global__ void kernPadArray(int n, int paddedLen, int* odata, const int* idata) { int index = threadIdx.x + (blockIdx.x * blockDim.x); if (index < n) { odata[index] = idata[index]; @@ -25,6 +25,21 @@ namespace StreamCompaction { } } + __global__ void kernUpsweep(int numThreads, int readStride, int* data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= numThreads) { + return; + } + + int writeStride = readStride * 2; + + // Index of what element to write to is calculated using write stride + int writeIndex = (writeStride * index) + writeStride - 1; + int readIndex = (writeStride * index) + readStride - 1; + + data[writeIndex] += data[readIndex]; + } + int* dev_unpadded_idata; int* dev_idata; int* dev_odata; @@ -42,19 +57,30 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_unpadded_idata, n * sizeof(int)); cudaMalloc((void**)&dev_idata, paddedLength * sizeof(int)); - cudaMalloc((void**)&dev_odata, paddedLength * sizeof(int)); cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorFn("Cuda memcpy idata no work"); kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); + + // Build tree (upsweep) + // readStride = 2^depth, where depth goes from 0... log2n - 1... the stride between elements we read and sum + // writeStride = 2^(depth + 1)... 
the stride between indices of elements we store the sums in + for (int readStride = 1; readStride < paddedLength; readStride *= 2) { + int writeStride = readStride * 2; + + int numThreads = paddedLength / writeStride; // one thread per element we write to + dim3 numBlocks((paddedLength + blockSize - 1) / blockSize); + + kernUpsweep << > > (numThreads, readStride, dev_idata); + } + printCudaArray(paddedLength, dev_idata); - // Build tree + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_unpadded_idata); cudaFree(dev_idata); - cudaFree(dev_odata); timer().endGpuTimer(); } From 2449d0db6ccbcec8034a3d8eb081c8182a6e43a3 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sat, 17 Sep 2022 20:41:38 -0400 Subject: [PATCH 07/17] working work efficient scan --- stream_compaction/efficient.cu | 38 ++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 7ec0686..61dd030 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -40,6 +40,22 @@ namespace StreamCompaction { data[writeIndex] += data[readIndex]; } + __global__ void kernDownsweep(int numThreads, int writeStride, int* data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= numThreads) { + return; + } + + int readStride = writeStride * 2; + + int leftChildIndex = index * readStride + writeStride - 1; + int rightChildIndex = index * readStride + readStride - 1; // right child is also where parent is stored + + int temp = data[leftChildIndex]; + data[leftChildIndex] = data[rightChildIndex]; + data[rightChildIndex] += temp; + } + int* dev_unpadded_idata; int* dev_idata; int* dev_odata; @@ -70,12 +86,30 @@ namespace StreamCompaction { int writeStride = readStride * 2; int numThreads = paddedLength / writeStride; // one thread per element we write to - dim3 numBlocks((paddedLength + blockSize - 1) / blockSize); + dim3 numBlocks((numThreads + blockSize - 1) / blockSize); kernUpsweep << > > (numThreads, readStride, dev_idata); + cudaDeviceSynchronize(); } - printCudaArray(paddedLength, dev_idata); + // Down sweep + // In down sweep, children now read info from parent. So writeStride = readStride / 2 + // Write stride = n/2, n/4, ... 4, 2, 1, aka. 
2^depth + // Read stride = 2^(depth + 1) + + // First set parent to 0 + int zero = 0; + cudaMemcpy(dev_idata + paddedLength - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); + + for (int writeStride = paddedLength / 2; writeStride >= 1; writeStride = writeStride >> 1) { + int readStride = writeStride * 2; + + // now launch 1 thread per element we read from + int numThreads = paddedLength / readStride; + dim3 numBlocks((numThreads + blockSize - 1) / blockSize); + + kernDownsweep << > > (numThreads, writeStride, dev_idata); + } cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); From dacca18ade85a5a5cb200fb895e65d8d53336502 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sat, 17 Sep 2022 21:29:41 -0400 Subject: [PATCH 08/17] gpu stream compaction appears to work --- stream_compaction/efficient.cu | 140 ++++++++++++++++++++++++--------- 1 file changed, 104 insertions(+), 36 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 61dd030..4fabf61 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -56,9 +56,39 @@ namespace StreamCompaction { data[rightChildIndex] += temp; } - int* dev_unpadded_idata; - int* dev_idata; - int* dev_odata; + void scanImpl(int paddedLength, int* dev_idata) { + // Build tree (upsweep) + // readStride = 2^depth, where depth goes from 0... log2n - 1... the stride between elements we read and sum + // writeStride = 2^(depth + 1)... the stride between indices of elements we store the sums in + for (int readStride = 1; readStride < paddedLength; readStride *= 2) { + int writeStride = readStride * 2; + + int numThreads = paddedLength / writeStride; // one thread per element we write to + dim3 numBlocks((numThreads + blockSize - 1) / blockSize); + + kernUpsweep << > > (numThreads, readStride, dev_idata); + cudaDeviceSynchronize(); + } + + // Down sweep + // In down sweep, children now read info from parent. So writeStride = readStride / 2 + // Write stride = n/2, n/4, ... 4, 2, 1, aka. 2^depth + // Read stride = 2^(depth + 1) + + // First set parent to 0 + int zero = 0; + cudaMemcpy(dev_idata + paddedLength - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); + + for (int writeStride = paddedLength / 2; writeStride >= 1; writeStride = writeStride >> 1) { + int readStride = writeStride * 2; + + // now launch 1 thread per element we read from + int numThreads = paddedLength / readStride; + dim3 numBlocks((numThreads + blockSize - 1) / blockSize); + + kernDownsweep << > > (numThreads, writeStride, dev_idata); + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. @@ -67,6 +97,9 @@ namespace StreamCompaction { timer().startGpuTimer(); // TODO // Pad array + int* dev_unpadded_idata; + int* dev_idata; + int exponent = ilog2ceil(n); int paddedLength = pow(2, exponent); dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); @@ -79,38 +112,7 @@ namespace StreamCompaction { kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); - // Build tree (upsweep) - // readStride = 2^depth, where depth goes from 0... log2n - 1... the stride between elements we read and sum - // writeStride = 2^(depth + 1)... 
the stride between indices of elements we store the sums in - for (int readStride = 1; readStride < paddedLength; readStride *= 2) { - int writeStride = readStride * 2; - - int numThreads = paddedLength / writeStride; // one thread per element we write to - dim3 numBlocks((numThreads + blockSize - 1) / blockSize); - - kernUpsweep << > > (numThreads, readStride, dev_idata); - cudaDeviceSynchronize(); - } - - // Down sweep - // In down sweep, children now read info from parent. So writeStride = readStride / 2 - // Write stride = n/2, n/4, ... 4, 2, 1, aka. 2^depth - // Read stride = 2^(depth + 1) - - // First set parent to 0 - int zero = 0; - cudaMemcpy(dev_idata + paddedLength - 1, &zero, sizeof(int), cudaMemcpyHostToDevice); - - for (int writeStride = paddedLength / 2; writeStride >= 1; writeStride = writeStride >> 1) { - int readStride = writeStride * 2; - - // now launch 1 thread per element we read from - int numThreads = paddedLength / readStride; - dim3 numBlocks((numThreads + blockSize - 1) / blockSize); - - kernDownsweep << > > (numThreads, writeStride, dev_idata); - } - + scanImpl(paddedLength, dev_idata); cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_unpadded_idata); @@ -119,6 +121,28 @@ namespace StreamCompaction { timer().endGpuTimer(); } + __global__ void kernGetPaddedBoolArray(int n, int paddedLength, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < n) { + odata[index] = idata[index] == 0 ? 0 : 1; + } + else if (index < paddedLength) { + odata[index] = 0; + } + } + + __global__ void kernScatter(int n, int* odata, int* idata, int *boolData, int *boolScan) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (boolData[index] == 1) { + int writeIndex = boolScan[index]; + odata[writeIndex] = idata[index]; + } + } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. 
@@ -131,8 +155,52 @@ namespace StreamCompaction { int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + int* dev_boolData; + int* dev_boolScan; + int* dev_idata; + int* dev_odata; + int exponent = ilog2ceil(n); + int paddedLength = pow(2, exponent); + + cudaMalloc((void**)&dev_boolData, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_boolScan, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); // don't need padding for scatter step + // copy input data + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); + kernGetPaddedBoolArray << > > (n, paddedLength, dev_boolData, dev_idata); + // copy dev_boolData since the scan implementation is destructive + cudaMemcpy(dev_boolScan, dev_boolData, paddedLength * sizeof(int), cudaMemcpyDeviceToDevice); + scanImpl(paddedLength, dev_boolScan); + + dim3 fullBlocksPerGridNonPadded((n + blockSize - 1) / blockSize); + kernScatter << > > + (n, dev_odata, dev_idata, dev_boolData, dev_boolScan); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + // How to get the total # of elements compacted: + // Get last (index n - 1) element of dev_boolScan and last (index n - 1) element of dev_boolData + // if dev_boolData[n - 1] = 0, then it's the last element + // Otherwise it's the last element + 1 + + int lastElementIsIncluded, lastBoolScanVal, resultLength; + + cudaMemcpy(&lastElementIsIncluded, dev_boolData + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBoolScanVal, dev_boolScan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + resultLength = lastElementIsIncluded == 1 ? lastBoolScanVal + 1 : lastBoolScanVal; + + cudaFree(dev_boolData); + cudaFree(dev_boolScan); + cudaFree(dev_idata); + cudaFree(dev_odata); + timer().endGpuTimer(); - return -1; + + return resultLength; } } } From 303672fe02eeb210cb9c3fcee20a47a4e691135b Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sat, 17 Sep 2022 22:03:52 -0400 Subject: [PATCH 09/17] update gpu timers and add thrust scan --- stream_compaction/efficient.cu | 20 +++++++++++++------- stream_compaction/naive.cu | 6 ++++-- stream_compaction/thrust.cu | 15 +++++++++++++++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4fabf61..025bf38 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -94,7 +94,6 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO // Pad array int* dev_unpadded_idata; @@ -110,15 +109,20 @@ namespace StreamCompaction { cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorFn("Cuda memcpy idata no work"); + // start timer after all initial memcpys + timer().startGpuTimer(); + kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); scanImpl(paddedLength, dev_idata); + + // end timer before copying result array + timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_unpadded_idata); cudaFree(dev_idata); - - timer().endGpuTimer(); } __global__ void kernGetPaddedBoolArray(int n, int paddedLength, int* odata, const int* idata) { @@ -153,7 +157,6 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO int* dev_boolData; int* dev_boolScan; @@ -166,9 +169,11 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_boolScan, paddedLength * sizeof(int)); cudaMalloc((void**)&dev_idata, n * sizeof(int)); cudaMalloc((void**)&dev_odata, n * sizeof(int)); // don't need padding for scatter step - // copy input data cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + // Start timer after copying input data + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); kernGetPaddedBoolArray << > > (n, paddedLength, dev_boolData, dev_idata); // copy dev_boolData since the scan implementation is destructive @@ -179,6 +184,9 @@ namespace StreamCompaction { kernScatter << > > (n, dev_odata, dev_idata, dev_boolData, dev_boolScan); + // End timer before copying odata out + timer().endGpuTimer(); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); // How to get the total # of elements compacted: @@ -198,8 +206,6 @@ namespace StreamCompaction { cudaFree(dev_idata); cudaFree(dev_odata); - timer().endGpuTimer(); - return resultLength; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 873aba8..a9da4c8 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -60,7 +60,6 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO // Add 1 because we're going to offset by a zero for exclusive scan @@ -75,6 +74,8 @@ namespace StreamCompaction { cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorWithLine("memcpy idata failed!"); + timer().startGpuTimer(); + dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); kernShiftAndPadInput<<>> @@ -92,6 +93,8 @@ namespace StreamCompaction { std::swap(dev_odata, dev_idata); } + timer().endGpuTimer(); + // Now result buffer is in dev_idata // We only need the first n elements of the total paddedLength elements cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); @@ -100,7 +103,6 @@ namespace StreamCompaction { cudaFree(dev_idata); cudaFree(dev_odata); cudaFree(dev_unpadded_idata); - timer().endGpuTimer(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..27831c0 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,26 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_idata; + int* dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + thrust::device_ptr thrust_dev_idata(dev_idata); + thrust::device_ptr thrust_dev_odata(dev_odata); + + // Start timer after input data is copied timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(thrust_dev_idata, thrust_dev_idata + n, thrust_dev_odata); + timer().endGpuTimer(); + + // Copy the data back (not included in timing) + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); } } } From afc875f44e464141810d34e37ad06cc294b012f4 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 19:51:02 -0400 Subject: [PATCH 10/17] working radish sort --- performance-at-last-commit.txt | 55 +++++++++++++++++++ src/main.cpp | 37 ++++++++++++- stream_compaction/common.cu | 20 +++++++ stream_compaction/common.h | 4 ++ stream_compaction/cpu.cu | 44 ++++++++++++++++ stream_compaction/cpu.h | 2 + stream_compaction/efficient.cu | 25 +-------- stream_compaction/efficient.h | 2 + stream_compaction/naive.cu | 96 ++++++++++++++++++++++++++++++++++ stream_compaction/naive.h | 1 + 10 files changed, 262 insertions(+), 24 deletions(-) create mode 100644 performance-at-last-commit.txt diff --git a/performance-at-last-commit.txt b/performance-at-last-commit.txt new file mode 100644 index 0000000..b1c619c --- /dev/null +++ b/performance-at-last-commit.txt @@ -0,0 +1,55 @@ +On length = 1 << 18 + +**************** +** SCAN TESTS ** +**************** + [ 4 25 22 2 14 31 30 0 44 13 11 45 24 ... 5 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.3819ms (std::chrono Measured) + [ 0 4 29 51 53 67 98 128 128 172 185 196 241 ... 6433433 6433438 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.4017ms (std::chrono Measured) + [ 0 4 29 51 53 67 98 128 128 172 185 196 241 ... 
6433393 6433393 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.327168ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.229856ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.331456ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.343808ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.146592ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.190304ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 0 0 1 3 3 3 3 1 1 1 1 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.5161ms (std::chrono Measured) + [ 3 1 3 3 3 3 1 1 1 1 2 3 3 ... 2 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.5447ms (std::chrono Measured) + [ 3 1 3 3 3 3 1 1 1 1 2 3 3 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 1.4394ms (std::chrono Measured) + [ 3 1 3 3 3 3 1 1 1 1 2 3 3 ... 2 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.3688ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.500928ms (CUDA Measured) + passed +Appuyez sur une touche pour continuer... \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 6ec45c8..1abbd00 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array = 256 +const int SIZE = 1 << 15; // feel free to change the size of array = 256 const int NPOT = SIZE - 3; // Non-Power-Of-Two = 253 int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -147,6 +147,41 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + #define RADIX_NUM_BITS ilog2ceil(4) + + zeroArray(SIZE, b); + printDesc("cpu radix sort, power-of-two"); + StreamCompaction::CPU::radixSort(SIZE, RADIX_NUM_BITS, b, a); + //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, power of two"); + StreamCompaction::Naive::radixSort(SIZE, RADIX_NUM_BITS, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(SIZE, b, c); // TODO: add cpu impl and write to b for comparison + + zeroArray(SIZE, b); + printDesc("cpu radix sort, non-power-of-two"); + StreamCompaction::CPU::radixSort(NPOT, RADIX_NUM_BITS, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("radix sort, non-power of two"); + StreamCompaction::Naive::radixSort(NPOT, RADIX_NUM_BITS, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printCmpResult(NPOT, b, c); // TODO: add cpu impl and write to b for comparison + system("pause"); // stop Win32 console from 
closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 58bb3b1..37f6a8f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -34,5 +34,25 @@ namespace StreamCompaction { // TODO } + // unlike naive impl, this one doesn't shift the array + __global__ void kernPadArray(int n, int paddedLen, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < n) { + odata[index] = idata[index]; + } + else if (index < paddedLen) { + odata[index] = 0; + } + } + + __global__ void kernGetPaddedBoolArray(int n, int paddedLength, int* odata, const int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < n) { + odata[index] = idata[index] == 0 ? 0 : 1; + } + else if (index < paddedLength) { + odata[index] = 0; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 5405eeb..575ce35 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -47,6 +47,10 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void kernPadArray(int n, int paddedLength, int* odata, const int* idata); + + __global__ void kernGetPaddedBoolArray(int n, int paddedLength, int* odata, const int* idata); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index cf65471..eee979c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -97,5 +97,49 @@ namespace StreamCompaction { return outIndex + 1; } + + void split(int n, int bitmask, int* odata, const int* idata) { + // write 0s to odata in 1 pass, 1s in the next pass + int zeroCount = 0; + for (int i = 0; i < n; ++i) { + if ((idata[i] & bitmask) == 0) { + odata[zeroCount] = idata[i]; + ++zeroCount; // eg. 
we write a total of 123 elts with 0s, then indices 0...122s are filled + } + } + + int oneCount = 0; + for (int i = 0; i < n; ++i) { + if ((idata[i] & bitmask) != 0) { + odata[zeroCount + oneCount] = idata[i]; + ++oneCount; + } + } + } + + // CPU radix sort implementation + void radixSort(int n, int numBits, int* odata, const int* idata) + { + // use temporary buffers so we don't destroy idata + int* temp_odata = (int*) malloc(n * sizeof(int)); + int* temp_odata2 = (int*)malloc(n * sizeof(int)); + if (temp_odata == NULL || temp_odata2 == NULL) { + printf("ERROR - failed to allocate temp_odata"); + return; + } + memcpy(temp_odata, idata, n * sizeof(int)); + + int bitmask = 1; + for (int i = 0; i < numBits; ++i) { + split(n, bitmask, temp_odata2, temp_odata); + std::swap(temp_odata2, temp_odata); // at end of loop, temp_odata always has most updated info + bitmask <<= 1; + } + + memcpy(odata, temp_odata, n * sizeof(int)); + + free(temp_odata); + free(temp_odata2); + } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..9d9e8cd 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,7 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void radixSort(int n, int numBits, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 025bf38..a6d6010 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -14,17 +14,6 @@ namespace StreamCompaction { return timer; } - // unlike naive impl, this one doesn't shift the array - __global__ void kernPadArray(int n, int paddedLen, int* odata, const int* idata) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index < n) { - odata[index] = idata[index]; - } - else if (index < paddedLen) { - odata[index] = 0; - } - } - __global__ void kernUpsweep(int numThreads, int readStride, int* data) { int index = threadIdx.x + (blockIdx.x * blockDim.x); if (index >= numThreads) { @@ -112,7 +101,7 @@ namespace StreamCompaction { // start timer after all initial memcpys timer().startGpuTimer(); - kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); + Common::kernPadArray << > > (n, paddedLength, dev_idata, dev_unpadded_idata); scanImpl(paddedLength, dev_idata); @@ -125,16 +114,6 @@ namespace StreamCompaction { cudaFree(dev_idata); } - __global__ void kernGetPaddedBoolArray(int n, int paddedLength, int* odata, const int* idata) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index < n) { - odata[index] = idata[index] == 0 ? 
0 : 1; - } - else if (index < paddedLength) { - odata[index] = 0; - } - } - __global__ void kernScatter(int n, int* odata, int* idata, int *boolData, int *boolScan) { int index = threadIdx.x + (blockIdx.x * blockDim.x); if (index >= n) { @@ -175,7 +154,7 @@ namespace StreamCompaction { timer().startGpuTimer(); dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); - kernGetPaddedBoolArray << > > (n, paddedLength, dev_boolData, dev_idata); + Common::kernGetPaddedBoolArray << > > (n, paddedLength, dev_boolData, dev_idata); // copy dev_boolData since the scan implementation is destructive cudaMemcpy(dev_boolScan, dev_boolData, paddedLength * sizeof(int), cudaMemcpyDeviceToDevice); scanImpl(paddedLength, dev_boolScan); diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..b19f4d5 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,8 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scanImpl(int paddedLength, int* dev_data); + int compact(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index a9da4c8..7e4ba53 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -2,6 +2,7 @@ #include #include "common.h" #include "naive.h" +#include "efficient.h" #include #include @@ -104,5 +105,100 @@ namespace StreamCompaction { cudaFree(dev_odata); cudaFree(dev_unpadded_idata); } + + // oTrueData[index] is 1 if idata[index] has bit 1 at position i + // oFalseData is just inverted oTrueData + // bit mask is of the form (in binary) 000...1...00 + __global__ void kernGetPaddedBoolArray + (int n, int paddedLength, int bitMask, int* falsesData, int* idata) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index < n) { + falsesData[index] = ((bitMask & idata[index]) == 0) ? 1 : 0; + } + else if (index < paddedLength) { + // Since we pad at the end, we need to pad end elements with bit = 1 (aka. falses = 0) + falsesData[index] = 0; + } + } + + __global__ void kernGetScatterAddresses(int n, int *odata, const int* falsesData, const int *falsesScanData, int totalFalses) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[index] = falsesData[index] == 1 + ? falsesScanData[index] + : index - falsesScanData[index] + totalFalses; // the "true" scan data offset by totalFalses + } + + __global__ void kernScatter(int n, int* odata, const int* idata, const int* scatterAddresses) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + odata[scatterAddresses[index]] = idata[index]; + } + + // Let's not worry about 2's complement for now... 
+ // Assume we only have positive integers + void radixSort(int n, int numBits, int* odata, const int* idata) { + int* dev_idata, * dev_odata; // two buffers needed because scatter step has race conditions + int *dev_falsesData, * dev_falsesScanData, * dev_scatterAddresses; + + int exponent = ilog2ceil(n); + int paddedLength = pow(2, exponent); + dim3 fullBlocksPerGridUnpadded((n + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridPadded((paddedLength + blockSize - 1) / blockSize); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMalloc((void**)&dev_falsesData, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_falsesScanData, paddedLength * sizeof(int)); + cudaMalloc((void**)&dev_scatterAddresses, n * sizeof(int)); + + int bitmask = 1; + for (int i = 0; i < numBits; ++i) { + // need indices from 0... paddedLength - 1 + kernGetPaddedBoolArray << < fullBlocksPerGridPadded, blockSize >> > + (n, paddedLength, bitmask, dev_falsesData, dev_idata); + + cudaMemcpy(dev_falsesScanData, dev_falsesData, paddedLength * sizeof(int), cudaMemcpyDeviceToDevice); + Efficient::scanImpl(paddedLength, dev_falsesScanData); + + // Calculate total falses = dev_falsesScanData[n-1] + dev_falsesData[n-1] + // TODO: just make it a function + int lastElementIsIncluded, lastBoolScanVal, totalFalses; + cudaMemcpy(&lastElementIsIncluded, dev_falsesData + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastBoolScanVal, dev_falsesScanData + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + totalFalses = lastElementIsIncluded + lastBoolScanVal; + + kernGetScatterAddresses << > > + (n, dev_scatterAddresses, dev_falsesData, dev_falsesScanData, totalFalses); + + kernScatter<<>>(n, dev_odata, dev_idata, dev_scatterAddresses); + + cudaDeviceSynchronize(); + + //printf("MEOW total falses = %d\n", totalFalses); + //printCudaArray(n, dev_falsesData); + //printCudaArray(n, dev_falsesScanData); + //printCudaArray(n, dev_scatterAddresses); + //printCudaArray(n, dev_odata); + + bitmask = bitmask << 1; // eg. 
...0010 => ...0100 + std::swap(dev_odata, dev_idata); // dev_idata always has latest + } + + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_falsesData); + cudaFree(dev_falsesScanData); + cudaFree(dev_scatterAddresses); + } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..86c4303 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -7,5 +7,6 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void scan(int n, int *odata, const int *idata); + void radixSort(int n, int numBits, int* odata, const int* idata); } } From 34adb35e6a64e61c5ddf5113c0bfb3c31a5b0a1a Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 19:54:16 -0400 Subject: [PATCH 11/17] add timers --- stream_compaction/cpu.cu | 4 ++++ stream_compaction/naive.cu | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index eee979c..7196043 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -129,6 +129,8 @@ namespace StreamCompaction { } memcpy(temp_odata, idata, n * sizeof(int)); + timer().startCpuTimer(); + int bitmask = 1; for (int i = 0; i < numBits; ++i) { split(n, bitmask, temp_odata2, temp_odata); @@ -136,6 +138,8 @@ namespace StreamCompaction { bitmask <<= 1; } + timer().endCpuTimer(); // end before final memcpy (which just transfers output to odata) + memcpy(odata, temp_odata, n * sizeof(int)); free(temp_odata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 7e4ba53..7214625 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -159,6 +159,8 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_falsesScanData, paddedLength * sizeof(int)); cudaMalloc((void**)&dev_scatterAddresses, n * sizeof(int)); + timer().startGpuTimer(); + int bitmask = 1; for (int i = 0; i < numBits; ++i) { // need indices from 0... paddedLength - 1 @@ -192,6 +194,8 @@ namespace StreamCompaction { std::swap(dev_odata, dev_idata); // dev_idata always has latest } + timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_idata); From 802fc2997a68ab78fffbf643b7853b365c2797a4 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 21:05:38 -0400 Subject: [PATCH 12/17] remove unneeded padding from naive scan, fix test printing wrong number of elts --- README.md | 9 ++++----- src/main.cpp | 8 ++++---- stream_compaction/naive.cu | 41 ++++++++------------------------------ 3 files changed, 16 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index ad1c0bd..a4aff72 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ Tested on AORUS 15P XD laptop with specs: - 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz 2.30 GHz - NVIDIA GeForce RTX 3070 Laptop GPU -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
- +I implemented the following parallel algorithms on the GPU and benchmarked them against my own CPU implementations and Thrust on the GPU: +- Scan +- Stream compaction +- Radix sort \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 1abbd00..0abb168 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 15; // feel free to change the size of array = 256 +const int SIZE = 1 << 12; // feel free to change the size of array = 256 const int NPOT = SIZE - 3; // Non-Power-Of-Two = 253 int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -152,11 +152,11 @@ int main(int argc, char* argv[]) { printf("** RADIX SORT TESTS **\n"); printf("*****************************\n"); - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 696969); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); - #define RADIX_NUM_BITS ilog2ceil(4) + #define RADIX_NUM_BITS ilog2ceil(696969) zeroArray(SIZE, b); printDesc("cpu radix sort, power-of-two"); @@ -174,7 +174,7 @@ int main(int argc, char* argv[]) { printDesc("cpu radix sort, non-power-of-two"); StreamCompaction::CPU::radixSort(NPOT, RADIX_NUM_BITS, b, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(NPOT, b, true); zeroArray(SIZE, c); printDesc("radix sort, non-power of two"); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 7214625..768b02c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -19,9 +19,6 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - int* dev_idata; - int* dev_odata; - int* dev_unpadded_idata; // TODO: __global__ __global__ void kernScan(int n, int offset, int* odata, const int* idata) { @@ -38,52 +35,31 @@ namespace StreamCompaction { } } - // Pad the data with 1 zero at the beginning - // And enough zeroes at the end - // odata should be buffer of size paddedLength = the next power of two after and including (n + 1) - __global__ void kernShiftAndPadInput(int n, int paddedLength, int* odata, int* idata) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index == 0) { - odata[index] = 0; - } - else if (index <= n) { - odata[index] = idata[index - 1]; - } - else if (index < paddedLength) { - odata[index] = 0; - } - else { - return; - } - } - /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
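         * For intuition: copying a leading 0 ahead of the input and running the inclusive
         * passes over n + 1 elements leaves the exclusive scan in the first n slots.
         * E.g. for a hypothetical idata = [1, 2, 3, 4]:
         *   shifted input [0, 1, 2, 3, 4] -> inclusive scan [0, 1, 3, 6, 10]
         *   first n = 4 outputs: [0, 1, 3, 6], the exclusive scan of idata.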
*/ void scan(int n, int *odata, const int *idata) { // TODO - + int* dev_idata; + int* dev_odata; + // Add 1 because we're going to offset by a zero for exclusive scan + int paddedLength = n + 1; int exponent = ilog2ceil(n + 1); - int paddedLength = pow(2, exponent); - // Input and output should be padded by 1s and 0s - cudaMalloc((void**)&dev_unpadded_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_idata, paddedLength * sizeof(int)); cudaMalloc((void**)&dev_odata, paddedLength * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc failed"); - cudaMemcpy(dev_unpadded_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + int identity = 0; + cudaMemcpy(dev_idata, &identity, sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_idata + 1, idata, n * sizeof(int), cudaMemcpyHostToDevice); checkCUDAErrorWithLine("memcpy idata failed!"); timer().startGpuTimer(); dim3 fullBlocksPerGrid((paddedLength + blockSize - 1) / blockSize); - kernShiftAndPadInput<<>> - (n, paddedLength, dev_idata, dev_unpadded_idata); - - //printCudaArray(paddedLength, dev_idata); - for (int d = 1; d <= exponent; ++d) { int offset = pow(2, d - 1); kernScan << < fullBlocksPerGrid, blockSize >> > (paddedLength, offset, dev_odata, dev_idata); @@ -103,7 +79,6 @@ namespace StreamCompaction { cudaFree(dev_idata); cudaFree(dev_odata); - cudaFree(dev_unpadded_idata); } // oTrueData[index] is 1 if idata[index] has bit 1 at position i From b9db98105eeb8c6307d8bf21a1194f8ec86bcb09 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 22:26:04 -0400 Subject: [PATCH 13/17] update readme --- README.md | 118 +++++++++++++++++- ...plementations on Different Array Sizes.svg | 1 + src/main.cpp | 2 +- 3 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 img/Performance of Scan Implementations on Different Array Sizes.svg diff --git a/README.md b/README.md index a4aff72..bcf05a1 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,120 @@ Tested on AORUS 15P XD laptop with specs: - NVIDIA GeForce RTX 3070 Laptop GPU I implemented the following parallel algorithms on the GPU and benchmarked them against my own CPU implementations and Thrust on the GPU: -- Scan +- Naive Scan +- Work efficient scan - Stream compaction -- Radix sort \ No newline at end of file +- Radix sort + +I roughly optimized the block size for each algorithm by seeing what block size performed the fastest on arrays of size 2^22 (approx 4 million). + +| Block Size | Runtime - Naive (ms) | Runtime - Work efficient (ms) | +| ----------- | ----------- | ----------- | +32 | 4.21734 | 1.32106 +64 |2.11395 |1.36259 +128 |2.09267| 1.26221 +256| 2.09258 |1.28563 +384 |2.11395 |1.3327 +768 |2.11405| 1.26701 + +Performance was pretty similar for most block sizes, but started to suffer for both naive and work efficient at around 32 or 64 threads per block. In this case, I decided to use a block size of 128 threads to compare the algorithms on different array sizes. 
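
For context, `blockSize` only enters through the kernel launch configuration; each pass rounds its thread count up to a whole number of `blockSize`-thread blocks and bounds-checks inside the kernel, e.g. the upsweep launch in `efficient.cu`:

```
int numThreads = paddedLength / writeStride;              // one thread per element written this pass
dim3 numBlocks((numThreads + blockSize - 1) / blockSize); // round up to whole blocks
kernUpsweep<<<numBlocks, blockSize>>>(numThreads, readStride, dev_idata);
```

So a different block size mostly just trades blocks for threads per block, which is presumably why the timings above are so close outside the very small block sizes.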
+ +![](img/Performance%20of%20Scan%20Implementations%20on%20Different%20Array%20Sizes.svg) + +| Array size | CPU (m/s) | Naive (m/s) | Work efficient (m/s) | Thrust (m/s) | +|------------|---------|----------|----------------|----------| +| 65536 | 0.1023 | 0.158464 | 0.266592 | 0.045472 | +| 262144 | 0.399 | 0.2616 | 0.33888 | 0.194144 | +| 1048576 | 1.6835 | 0.636288 | 0.472416 | 0.351648 | +| 4194304 | 6.392 | 2.20544 | 1.27302 | 0.523776 | +| 16777216 | 25.5751 | 8.98938 | 4.05302 | 1.09213 | +| 67108864 | 100.736 | 38.8708 | 15.4414 | 2.14362 | +| 268435456 | 410.365 | 169.486 | 60.6265 | 6.23341 | + +### Extra credit + +#### Performance +My work efficient scan halves the total number of threads launched each iteration, so the work efficient scan is faster at than naive and CPU implementation at array sizes of 2^18 and larger. This means less threads are idling and taking up space on the GPU multi-processors. + +#### Radix sort + +I implemented Radix sort on the CPU, wrote two test cases, and added Radix sort on the GPU, which calls my work efficient scan. The functions to look at are: +- `radixSort` in `naive.cu` +- `radixSort` in `cpu.cu` + +A few notes: you can pass in the number of bits you want to sort by, which should be `ilog2ceil(MAX_ARRAY_ELEMENT_VALUE)`. Also, I assumed for simplicity that each element is a positive integer (although still using int and not unsigned int types) so I can just use a bitmask to compact the arrays. Finally, to test, the array size should not be too close to 2^31 because of integer overflow issues... + +#### Sample program output +Settings: `blockSize` = 128, `SIZE` = 1 << 20 + +``` + +**************** +** SCAN TESTS ** +**************** + [ 8 40 25 15 22 4 39 46 44 28 2 44 33 ... 30 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 1.5927ms (std::chrono Measured) + [ 0 8 48 73 88 110 114 153 199 243 271 273 317 ... 25674057 25674087 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.7177ms (std::chrono Measured) + [ 0 8 48 73 88 110 114 153 199 243 271 273 317 ... 25673959 25673964 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.634144ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.602528ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.457504ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.479328ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.340864ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.279648ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 0 2 2 3 1 2 1 1 2 2 0 3 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 2.2561ms (std::chrono Measured) + [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 2.1692ms (std::chrono Measured) + [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 5.498ms (std::chrono Measured) + [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 
1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.541312ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.490016ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** + [ 23914 19768 8186 18274 8083 25457 19314 28109 22549 17382 25706 27432 8575 ... 4256 0 ] +==== cpu radix sort, power-of-two ==== + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== radix sort, power of two ==== + elapsed time: 0.490016ms (CUDA Measured) + passed +==== cpu radix sort, non-power-of-two ==== + elapsed time: 109.43ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] +==== radix sort, non-power of two ==== + elapsed time: 0.490016ms (CUDA Measured) + passed +``` \ No newline at end of file diff --git a/img/Performance of Scan Implementations on Different Array Sizes.svg b/img/Performance of Scan Implementations on Different Array Sizes.svg new file mode 100644 index 0000000..db0a1e5 --- /dev/null +++ b/img/Performance of Scan Implementations on Different Array Sizes.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 0abb168..3984f60 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 12; // feel free to change the size of array = 256 +const int SIZE = 1 << 20; // feel free to change the size of array = 256 const int NPOT = SIZE - 3; // Non-Power-Of-Two = 253 int *a = new int[SIZE]; int *b = new int[SIZE]; From dafa0e4ba4115c5dd09cc74b470994da4f18cf89 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 23:08:06 -0400 Subject: [PATCH 14/17] Update README.md --- README.md | 68 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index bcf05a1..bbe432c 100644 --- a/README.md +++ b/README.md @@ -42,10 +42,23 @@ Performance was pretty similar for most block sizes, but started to suffer for b | 67108864 | 100.736 | 38.8708 | 15.4414 | 2.14362 | | 268435456 | 410.365 | 169.486 | 60.6265 | 6.23341 | +### Analysis +The CPU implementation's run time appears to be linear with respect to the number of array elements. This makes sense because each element is processed one at a time inside a for loop. + +Thrust is the fastest by far. This is probably because they are using shared memory, while all of my implementations only use global memory which is much slower to access, making each kernel thread slower. And maybe other optimizations as well. + +The work-efficient scan is faster than the naive scan. This should be because I made optimizations (see next section) to reduce the number of threads at each iteration, whereas the naive scan still launches n threads each iteration. + +In all implementations, computation should not be a performance bottleneck, since each kernel runs in about O(1) time, we can't really do better than that. + +Aside from the above trends, memory IO (cudaMemcpy) is a giant performance bottleneck. This is not shown in the performance graph since we start measuring runtime after the initial cudaMemcpy and stop measuring before the final cudaMemcpy. Still, cudaMemcpy runs in O(n) time, which effectively makes any GPU algorithm O(n), even though the actual implementation of scan runs in O(log n). 
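
One way to quantify that copy cost would be to time the host-to-device transfer by itself with CUDA events; a rough sketch (reusing the `n`, `idata`, `dev_idata` names from `scan`, not something measured here):

```
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("H2D: %.3f ms, ~%.1f GB/s\n", ms, (n * sizeof(int) / 1e9) / (ms / 1000.0));
```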
+
+However, in practice, cudaMemcpy is still very fast, probably because the hardware bandwidth for copying data from host to device is very large. For example, on an array of size 2^26, I ran my Radix sort algorithms: the CPU implementation took about 7 seconds (7131.1ms), while the GPU implementation took about 1 second (826.585ms) including the cudaMemcpy and about half a second (434.71ms) without it. So while cudaMemcpy is still a huge bottleneck for the GPU version (taking up about half its runtime), it does not drag the GPU implementation back down to CPU-like linear behavior, even at large array sizes. In the future, I could measure the effective bandwidth of cudaMemcpy on my GPU, for example with the event-timing sketch above.
+
 ### Extra credit
 
 #### Performance
-My work efficient scan halves the total number of threads launched each iteration, so the work efficient scan is faster at than naive and CPU implementation at array sizes of 2^18 and larger. This means less threads are idling and taking up space on the GPU multi-processors.
+My work-efficient scan halves the total number of threads launched each iteration, which means fewer threads are idling and taking up space on the GPU multi-processors while other threads could be running. As a result, the work-efficient scan is faster than the naive and CPU implementations at array sizes of 2^18 and larger.
 
 #### Radix sort
 
@@ -56,76 +69,77 @@ I implemented Radix sort on the CPU, wrote two test cases, and added Radix sort
 A few notes: you can pass in the number of bits you want to sort by, which should be `ilog2ceil(MAX_ARRAY_ELEMENT_VALUE)`. Also, I assumed for simplicity that each element is a positive integer (although still using int and not unsigned int types) so I can just use a bitmask to compact the arrays. Finally, to test, the array size should not be too close to 2^31 because of integer overflow issues...
 
 #### Sample program output
-Settings: `blockSize` = 128, `SIZE` = 1 << 20
+Settings: `blockSize` = 128, `SIZE` = 1 << 26
 
 ```
 
 ****************
 ** SCAN TESTS **
 ****************
-    [ 8 40 25 15 22 4 39 46 44 28 2 44 33 ... 30 0 ]
+    [ 29 25 4 37 8 30 21 31 8 21 22 19 3 ... 49 0 ]
 ==== cpu scan, power-of-two ====
-   elapsed time: 1.5927ms (std::chrono Measured)
-    [ 0 8 48 73 88 110 114 153 199 243 271 273 317 ... 25674057 25674087 ]
+   elapsed time: 102.046ms (std::chrono Measured)
+    [ 0 29 54 58 95 103 133 154 185 193 214 236 255 ... 1643658734 1643658783 ]
 ==== cpu scan, non-power-of-two ====
-   elapsed time: 1.7177ms (std::chrono Measured)
-    [ 0 8 48 73 88 110 114 153 199 243 271 273 317 ... 25673959 25673964 ]
+   elapsed time: 99.084ms (std::chrono Measured)
+    [ 0 29 54 58 95 103 133 154 185 193 214 236 255 ... 
1643658648 1643658670 ] passed ==== naive scan, power-of-two ==== - elapsed time: 0.634144ms (CUDA Measured) + elapsed time: 38.8846ms (CUDA Measured) passed ==== naive scan, non-power-of-two ==== - elapsed time: 0.602528ms (CUDA Measured) + elapsed time: 37.4897ms (CUDA Measured) passed ==== work-efficient scan, power-of-two ==== - elapsed time: 0.457504ms (CUDA Measured) + elapsed time: 15.4577ms (CUDA Measured) passed ==== work-efficient scan, non-power-of-two ==== - elapsed time: 0.479328ms (CUDA Measured) + elapsed time: 15.4086ms (CUDA Measured) passed ==== thrust scan, power-of-two ==== - elapsed time: 0.340864ms (CUDA Measured) + elapsed time: 2.09901ms (CUDA Measured) passed ==== thrust scan, non-power-of-two ==== - elapsed time: 0.279648ms (CUDA Measured) + elapsed time: 2.36995ms (CUDA Measured) passed ***************************** ** STREAM COMPACTION TESTS ** ***************************** - [ 2 0 2 2 3 1 2 1 1 2 2 0 3 ... 0 0 ] + [ 0 3 2 3 3 1 3 2 2 3 1 3 0 ... 3 0 ] ==== cpu compact without scan, power-of-two ==== - elapsed time: 2.2561ms (std::chrono Measured) - [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 1 1 ] + elapsed time: 137.514ms (std::chrono Measured) + [ 3 2 3 3 1 3 2 2 3 1 3 1 3 ... 2 3 ] passed ==== cpu compact without scan, non-power-of-two ==== - elapsed time: 2.1692ms (std::chrono Measured) - [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 1 1 ] + elapsed time: 137.567ms (std::chrono Measured) + [ 3 2 3 3 1 3 2 2 3 1 3 1 3 ... 2 1 ] passed ==== cpu compact with scan ==== - elapsed time: 5.498ms (std::chrono Measured) - [ 2 2 2 3 1 2 1 1 2 2 3 1 2 ... 1 1 ] + elapsed time: 348.893ms (std::chrono Measured) + [ 3 2 3 3 1 3 2 2 3 1 3 1 3 ... 2 3 ] passed ==== work-efficient compact, power-of-two ==== - elapsed time: 0.541312ms (CUDA Measured) + elapsed time: 19.1836ms (CUDA Measured) passed ==== work-efficient compact, non-power-of-two ==== - elapsed time: 0.490016ms (CUDA Measured) + elapsed time: 19.1201ms (CUDA Measured) passed ***************************** ** RADIX SORT TESTS ** ***************************** - [ 23914 19768 8186 18274 8083 25457 19314 28109 22549 17382 25706 27432 8575 ... 4256 0 ] + [ 31399 13580 25635 22845 23360 14322 9628 3467 20074 16251 14385 30083 26014 ... 230 0 ] ==== cpu radix sort, power-of-two ==== + elapsed time: 7131.1ms (std::chrono Measured) [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 32767 32767 ] ==== radix sort, power of two ==== - elapsed time: 0.490016ms (CUDA Measured) + elapsed time: 826.585ms (CUDA Measured) passed ==== cpu radix sort, non-power-of-two ==== - elapsed time: 109.43ms (std::chrono Measured) + elapsed time: 7102.31ms (std::chrono Measured) [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 
32767 32767 ] ==== radix sort, non-power of two ==== - elapsed time: 0.490016ms (CUDA Measured) + elapsed time: 788.974ms (CUDA Measured) passed -``` \ No newline at end of file +``` From e7a4b08d7a9429813ffedeb1c21c885be02118b9 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 23:08:22 -0400 Subject: [PATCH 15/17] fix timing measurement --- src/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 3984f60..6667cad 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 20; // feel free to change the size of array = 256 +const int SIZE = 1 << 26; // feel free to change the size of array = 256 const int NPOT = SIZE - 3; // Non-Power-Of-Two = 253 int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -161,13 +161,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu radix sort, power-of-two"); StreamCompaction::CPU::radixSort(SIZE, RADIX_NUM_BITS, b, a); - //printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("radix sort, power of two"); StreamCompaction::Naive::radixSort(SIZE, RADIX_NUM_BITS, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printCmpResult(SIZE, b, c); // TODO: add cpu impl and write to b for comparison zeroArray(SIZE, b); @@ -179,7 +179,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("radix sort, non-power of two"); StreamCompaction::Naive::radixSort(NPOT, RADIX_NUM_BITS, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printCmpResult(NPOT, b, c); // TODO: add cpu impl and write to b for comparison system("pause"); // stop Win32 console from closing on exit From 3036ae29140727a750312b19e336d991695b8524 Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 23:11:24 -0400 Subject: [PATCH 16/17] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bbe432c..0c56d3e 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Performance was pretty similar for most block sizes, but started to suffer for b ![](img/Performance%20of%20Scan%20Implementations%20on%20Different%20Array%20Sizes.svg) -| Array size | CPU (m/s) | Naive (m/s) | Work efficient (m/s) | Thrust (m/s) | +| Array size | CPU (ms) | Naive (ms) | Work efficient (ms) | Thrust (ms) | |------------|---------|----------|----------------|----------| | 65536 | 0.1023 | 0.158464 | 0.266592 | 0.045472 | | 262144 | 0.399 | 0.2616 | 0.33888 | 0.194144 | From 502d1b30839e0cd344df8bb9af2881bb148c8eae Mon Sep 17 00:00:00 2001 From: Constance Wang Date: Sun, 18 Sep 2022 23:13:14 -0400 Subject: [PATCH 17/17] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 0c56d3e..c0f022c 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ My work efficient scan halves the total number of threads launched 
each iteratio I implemented Radix sort on the CPU, wrote two test cases, and added Radix sort on the GPU, which calls my work efficient scan. The functions to look at are: - `radixSort` in `naive.cu` - `radixSort` in `cpu.cu` +Example of usage: +`StreamCompaction::CPU::radixSort(NPOT, RADIX_NUM_BITS, b, a);` A few notes: you can pass in the number of bits you want to sort by, which should be `ilog2ceil(MAX_ARRAY_ELEMENT_VALUE)`. Also, I assumed for simplicity that each element is a positive integer (although still using int and not unsigned int types) so I can just use a bitmask to compact the arrays. Finally, to test, the array size should not be too close to 2^31 because of integer overflow issues...
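+
+For illustration, one pass of least-significant-digit radix sort can be sketched on the CPU as follows, using the bitmask and an exclusive scan as described above. This is only a sketch with made-up names; the actual `radixSort` implementations in `cpu.cu` and `naive.cu` may differ in details:
+
+```
+#include <vector>
+
+// One stable LSD radix pass: elements whose bit `bit` is 0 are written to the
+// front of odata (keeping their order), elements whose bit is 1 follow them.
+void radixPassSketch(int n, int bit, int* odata, const int* idata) {
+    if (n <= 0) return;
+    std::vector<int> isZero(n), zeroIndex(n);
+    for (int i = 0; i < n; ++i) {
+        isZero[i] = ((idata[i] >> bit) & 1) ? 0 : 1;  // bitmask test on bit `bit`
+    }
+    // Exclusive scan of isZero gives each "zero" element its output slot.
+    zeroIndex[0] = 0;
+    for (int i = 1; i < n; ++i) {
+        zeroIndex[i] = zeroIndex[i - 1] + isZero[i - 1];
+    }
+    int totalZeros = zeroIndex[n - 1] + isZero[n - 1];
+    for (int i = 0; i < n; ++i) {
+        // i - zeroIndex[i] counts the "one" elements that appear before index i.
+        int dst = isZero[i] ? zeroIndex[i] : totalZeros + (i - zeroIndex[i]);
+        odata[dst] = idata[i];
+    }
+}
+```
+
+Repeating this pass for each bit from 0 up to the number of bits passed in, ping-ponging between two buffers, leaves the array fully sorted (given the positive-integer assumption above).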