Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Xiaoyu Du #31

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 72 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,77 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Xiaoyu Du
* Tested on: Windows 10, i9-11900KF @ 3.50GHz, RTX 3080 (Personal PC)

### Description
This project implements and compares different methods of scan (prefix sum) and stream compaction.

### Feature
I implemented all the features for part 1 - 5.
* CPU Scan & Stream Compaction
* Naive GPU Scan Algorithm
* Work-Efficient GPU Scan & Stream Compaction
* Thrust's Implementation
* GPU Work-Efficient Method Optimization

### Performance Analysis
My optimized number of blocks is 128.
I compared the different methods across a range of array sizes, and the resulting plot is shown below. I am a bit confused about why my Thrust implementation takes so long to run; I believe my implementation is correct.
![](./images/1.png)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Below is the test output for a 2^20-element array.
```
****************
** SCAN TESTS **
****************
[ 28 9 12 41 33 49 46 3 11 27 35 5 47 ... 8 0 ]
==== cpu scan, power-of-two ====
elapsed time: 1.7669ms (std::chrono Measured)
[ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674595 25674603 ]
==== cpu scan, non-power-of-two ====
elapsed time: 1.7544ms (std::chrono Measured)
[ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674502 25674539 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.510176ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.695424ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.43328ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.631104ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 28.3783ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 7.89008ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 1 2 1 1 1 2 0 2 2 3 0 3 3 ... 3 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 3.7818ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 3.5542ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== cpu compact with scan ====
elapsed time: 9.8808ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 1.07133ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.849984ms (CUDA Measured)
passed
```
Binary file added images/1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 4 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 20; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -139,13 +139,16 @@ int main(int argc, char* argv[]) {
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);



zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);


system("pause"); // stop Win32 console from closing on exit
delete[] a;
Expand Down
47 changes: 45 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ namespace StreamCompaction {
/**
 * CPU exclusive prefix-sum (scan) over idata, written into odata.
 * odata[i] = idata[0] + ... + idata[i-1], with odata[0] = 0 (identity).
 * Timing is recorded with the CPU timer around the core loop.
 */
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    // Guard n <= 0: the original wrote odata[0] unconditionally,
    // which is out of bounds for an empty array.
    if (n > 0) {
        odata[0] = 0;  // exclusive scan starts at the additive identity
        for (int i = 1; i < n; ++i) {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }
    timer().endCpuTimer();
}

Expand All @@ -31,8 +35,15 @@ namespace StreamCompaction {
/**
 * CPU stream compaction without using a scan: copies every nonzero
 * element of idata forward into odata, preserving order.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed: a leftover template `return -1;` preceded `return num;`,
 * so the real element count was unreachable and callers always saw -1.
 */
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    int num = 0;  // write cursor; also the compacted length
    for (int i = 0; i < n; ++i) {
        if (idata[i] != 0) {
            odata[num] = idata[i];
            ++num;
        }
    }
    timer().endCpuTimer();
    return num;
}

/**
Expand All @@ -43,8 +54,40 @@ namespace StreamCompaction {
/**
 * CPU stream compaction using the map / scan / scatter decomposition
 * (mirrors the GPU algorithm): map to 0/1 flags, exclusive-scan the
 * flags to get destination indices, then scatter the survivors.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed: a leftover `return -1;` shadowed `return num;`, and n <= 0
 * caused out-of-bounds accesses (scanArr[0] write, scanArr[n-1] read).
 */
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    if (n <= 0) {  // nothing to compact; avoids UB in the steps below
        timer().endCpuTimer();
        return 0;
    }
    int* boolArr = new int[n];
    int* scanArr = new int[n];

    // 1) map: 1 for survivors, 0 for elements to discard
    for (int i = 0; i < n; ++i) {
        boolArr[i] = (idata[i] != 0) ? 1 : 0;
    }

    // 2) exclusive scan of the flags -> destination index of each survivor
    scanArr[0] = 0;
    for (int i = 1; i < n; ++i) {
        scanArr[i] = boolArr[i - 1] + scanArr[i - 1];
    }

    // 3) scatter survivors to their scanned positions
    for (int i = 0; i < n; ++i) {
        if (boolArr[i] == 1) {
            odata[scanArr[i]] = idata[i];
        }
    }

    // total survivors = last exclusive-scan value + last flag
    int num = scanArr[n - 1] + boolArr[n - 1];

    delete[] boolArr;
    delete[] scanArr;
    timer().endCpuTimer();
    return num;
}
}
}
145 changes: 143 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include "common.h"
#include "efficient.h"

#define blockSize 128

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,13 +14,94 @@ namespace StreamCompaction {
return timer;
}

// One up-sweep (parallel reduce) step of the work-efficient scan at depth d.
// Each of the `threadNeeded` threads owns one node pair spaced 2^(d+1) apart
// and folds the left child's partial sum into the right child, in place.
__global__ void kernUpSweep(int threadNeeded, int d, int* dev_idata) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= threadNeeded) {
        return;  // grid may overshoot; extra threads do nothing
    }
    int stride = 1 << (d + 1);  // distance between consecutive node pairs
    int half = 1 << d;          // offset of the left child within a pair
    int base = tid * stride;
    dev_idata[base + stride - 1] += dev_idata[base + half - 1];
}

// One down-sweep step of the work-efficient exclusive scan at depth d.
// Each thread swaps its left child with the parent value and accumulates
// the old left value into the right child, turning the reduce tree built
// by kernUpSweep into an exclusive prefix sum.
__global__ void kernDownSweep(int threadNeeded, int d, int* dev_idata){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= threadNeeded) {
        return;
    }
    int stride = 1 << (d + 1);  // spacing of node pairs at this depth
    int half = 1 << d;          // offset of the left child
    int base = tid * stride;
    int left = base + half - 1;
    int right = base + stride - 1;
    int saved = dev_idata[left];        // remember the left partial sum
    dev_idata[left] = dev_idata[right]; // left child <- parent
    dev_idata[right] += saved;          // right child <- parent + old left
}

// Maps each input element to a 0/1 flag: 1 where dev_idata[index] is
// nonzero (the element survives compaction), 0 otherwise.
// Fixed: the original only wrote the 1s and silently relied on the caller
// zero-filling temp_Arr beforehand; writing both values makes the kernel
// self-contained (and remains correct for callers that do memset first).
__global__ void kernMapToBoolean(int n, int* temp_Arr, int* dev_idata) {
    int index = threadIdx.x + (blockIdx.x * blockDim.x);
    if (index < n) {
        temp_Arr[index] = (dev_idata[index] != 0) ? 1 : 0;
    }
}

// Scatter pass: dev_tempArr holds the exclusive scan of the boolean map.
// Element i survives exactly when the scan value increases at i+1
// (scan[i+1] > scan[i] implies flag[i] == 1), and its destination slot is
// scan[i]. Index n-1 is deliberately skipped here because scan[i+1] does
// not exist for it; the host side patches the final element separately.
__global__ void kernScatter(int n, int* dev_tempArr, int* dev_finalArr, int* dev_idata) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n - 1) {
        return;
    }
    int slot = dev_tempArr[idx];
    if (dev_tempArr[idx + 1] > slot) {  // scan increased => element is nonzero
        dev_finalArr[slot] = dev_idata[idx];
    }
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
// GPU work-efficient exclusive scan (Blelloch): pads the input up to a
// power-of-two length, runs an in-place up-sweep (reduce) then a
// down-sweep on the device, and copies the first n results back to odata.
// Only the device-side work is timed; allocation and host<->device copies
// of the input/output fall outside the timer.
void scan(int n, int *odata, const int *idata) {

//used to round the array sizes to the next power of two.
int nCeil = ilog2ceil(n);
int n2PowCeil = 1 << nCeil;

// single device buffer, scanned in place; padded region beyond n
int* dev_idata;
cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
checkCUDAError("cudaMalloc dev_idata failed!");

cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy idata to dev_idata failed!");
//timer start
timer().startGpuTimer();
// zero the padding so the extra leaves contribute nothing to the sums
if (n2PowCeil != n) {
cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
checkCUDAError("cudaMemset failed!");
}

//open n threads is enough
// TODO
//up-sweep
// depth-d pass needs 2^(nCeil-d-1) threads; grid size shrinks each level
int depth = ilog2ceil(n2PowCeil) - 1;
for (int d = 0; d <= depth; ++d) {
int threadNeeded = 1 << (nCeil - d - 1);
dim3 fullBlocksPerGrid((blockSize + threadNeeded - 1) / blockSize);
kernUpSweep << <fullBlocksPerGrid, blockSize>> > (threadNeeded, d, dev_idata);
}
//down-sweep
// set the root to the identity (0) before descending — this is what
// makes the result an *exclusive* scan
cudaMemset(&(dev_idata[n2PowCeil -1]), 0, sizeof(int));
for (int d = depth; d >= 0; --d) {
int threadNeeded = 1 << (nCeil - d - 1);
dim3 fullBlocksPerGrid((blockSize + threadNeeded - 1) / blockSize);
kernDownSweep << <fullBlocksPerGrid, blockSize >> > (threadNeeded, d, dev_idata);
}
timer().endGpuTimer();


// only the first n entries are meaningful; the padded tail is discarded
cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("memory dev_idata to odata failed!");

cudaFree(dev_idata);
}

/**
Expand All @@ -31,10 +114,68 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
/**
 * GPU work-efficient stream compaction: map to boolean flags, exclusive
 * scan of the flags (up-sweep + down-sweep in place), then scatter the
 * nonzero elements to their scanned positions.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed two defects: (1) a leftover `return -1;` after endGpuTimer() made
 * the copy-back and the real `return numOfResults;` unreachable, so odata
 * was never filled; (2) dev_idata, dev_tempArr and dev_finalArr were never
 * freed, leaking device memory on every call.
 */
int compact(int n, int *odata, const int *idata) {
    // Round n up to the next power of two so the balanced-tree sweeps
    // always operate on a full tree.
    int nCeil = ilog2ceil(n);
    int n2PowCeil = 1 << nCeil;
    int* dev_idata;     // device copy of the input
    int* dev_tempArr;   // boolean flags, exclusive-scanned in place
    int* dev_finalArr;  // compacted output, sized after the up-sweep

    cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMalloc dev_idata failed!");

    cudaMalloc((void**)&dev_tempArr, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMalloc dev_tempArr failed!");

    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy from idata to dev_idata failed!");

    //start
    timer().startGpuTimer();
    // zero the padding so the extra leaves contribute nothing to the sums
    if (n2PowCeil != n) {
        cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
        checkCUDAError("cudaMemset dev_idata failed!");
    }

    cudaMemset(dev_tempArr, 0, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMemset dev_tempArr failed!");

    dim3 fullBlocksPerGrid((blockSize + n - 1) / blockSize);

    // 1) map input to 0/1 survivor flags
    kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_tempArr, dev_idata);
    // kernScatter skips index n-1 (it needs scan[i+1]); remember the last
    // input element so it can be patched into the output afterwards.
    int lastElement = idata[n - 1];

    // 2) exclusive scan of the flags: up-sweep builds the reduce tree
    int depth = ilog2ceil(n2PowCeil) - 1;
    for (int d = 0; d <= depth; ++d) {
        int threadNeeded = 1 << (nCeil - d - 1);
        dim3 sweepBlocks((blockSize + threadNeeded - 1) / blockSize);
        kernUpSweep << <sweepBlocks, blockSize >> > (threadNeeded, d, dev_tempArr);
    }
    // After the up-sweep, the root holds the total survivor count; size the
    // output buffer from it (the synchronous memcpy also fences the sweeps).
    int numOfResults;
    cudaMemcpy(&numOfResults, &(dev_tempArr[n2PowCeil - 1]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMalloc((void**)&dev_finalArr, numOfResults * sizeof(int));
    checkCUDAError("cudaMalloc dev_finalArr failed!");
    // down-sweep: root <- identity, then descend to finish the exclusive scan
    cudaMemset(&(dev_tempArr[n2PowCeil - 1]), 0, sizeof(int));
    for (int d = depth; d >= 0; --d) {
        int threadNeeded = 1 << (nCeil - d - 1);
        dim3 sweepBlocks((blockSize + threadNeeded - 1) / blockSize);
        kernDownSweep << <sweepBlocks, blockSize >> > (threadNeeded, d, dev_tempArr);
    }
    // 3) scatter survivors (all indices except n-1) to their slots
    kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_tempArr, dev_finalArr, dev_idata);

    timer().endGpuTimer();
    //end

    cudaMemcpy(odata, dev_finalArr, numOfResults * sizeof(int), cudaMemcpyDeviceToHost);
    if (lastElement) {
        // A nonzero final element was skipped by kernScatter; its slot is
        // always the last one, numOfResults - 1.
        odata[numOfResults - 1] = lastElement;
    }

    // release device buffers (previously leaked)
    cudaFree(dev_idata);
    cudaFree(dev_tempArr);
    cudaFree(dev_finalArr);

    return numOfResults;
}
}
}
Loading