Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Xiaoyu Du #31

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 72 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,77 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Xiaoyu Du
* Tested on: Windows 10, i9-11900KF @ 3.50GHz, RTX 3080 (Personal PC)

### Description
This project implements and compares different methods of scan (prefix sum) and stream compaction.

### Feature
I implemented all the features for part 1 - 5.
* CPU Scan & Stream Compaction
* Naive GPU Scan Algorithm
* Work-Efficient GPU Scan & Stream Compaction
* Thrust's Implementation
* GPU Work-Efficient Method Optimization

### Performance Analysis
My optimized number of blocks is 128.
I compared the different methods across a range of array sizes, and the resulting plot is shown below. I am a bit confused about why my Thrust implementation takes so long to run; I believe my implementation is correct.
![](./images/1.png)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Below is the test output for a 2^20-element array.
```
****************
** SCAN TESTS **
****************
[ 28 9 12 41 33 49 46 3 11 27 35 5 47 ... 8 0 ]
==== cpu scan, power-of-two ====
elapsed time: 1.7669ms (std::chrono Measured)
[ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674595 25674603 ]
==== cpu scan, non-power-of-two ====
elapsed time: 1.7544ms (std::chrono Measured)
[ 0 28 37 49 90 123 172 218 221 232 259 294 299 ... 25674502 25674539 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.510176ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.695424ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.43328ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.631104ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 28.3783ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 7.89008ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 1 2 1 1 1 2 0 2 2 3 0 3 3 ... 3 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 3.7818ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 3.5542ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== cpu compact with scan ====
elapsed time: 9.8808ms (std::chrono Measured)
[ 1 2 1 1 1 2 2 2 3 3 3 2 3 ... 3 3 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 1.07133ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.849984ms (CUDA Measured)
passed
```
Binary file added images/1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 4 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 20; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down Expand Up @@ -139,13 +139,16 @@ int main(int argc, char* argv[]) {
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);



zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);


system("pause"); // stop Win32 console from closing on exit
delete[] a;
Expand Down
47 changes: 45 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ namespace StreamCompaction {
/**
 * CPU exclusive prefix-sum (scan) over idata, written into odata.
 * odata[i] = idata[0] + ... + idata[i-1], with odata[0] = 0 (identity).
 * Timing is recorded with the CPU timer around the core loop.
 */
void scan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    // Guard n <= 0: the original wrote odata[0] unconditionally,
    // which is out of bounds for an empty array.
    if (n > 0) {
        odata[0] = 0;  // exclusive scan starts at the additive identity
        for (int i = 1; i < n; ++i) {
            odata[i] = odata[i - 1] + idata[i - 1];
        }
    }
    timer().endCpuTimer();
}

Expand All @@ -31,8 +35,15 @@ namespace StreamCompaction {
/**
 * CPU stream compaction without using a scan: copies every nonzero
 * element of idata forward into odata, preserving order.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed: a leftover template `return -1;` preceded `return num;`,
 * so the real element count was unreachable and callers always saw -1.
 */
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    int num = 0;  // write cursor; also the compacted length
    for (int i = 0; i < n; ++i) {
        if (idata[i] != 0) {
            odata[num] = idata[i];
            ++num;
        }
    }
    timer().endCpuTimer();
    return num;
}

/**
Expand All @@ -43,8 +54,40 @@ namespace StreamCompaction {
/**
 * CPU stream compaction using the map / scan / scatter decomposition
 * (mirrors the GPU algorithm): map to 0/1 flags, exclusive-scan the
 * flags to get destination indices, then scatter the survivors.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed: a leftover `return -1;` shadowed `return num;`, and n <= 0
 * caused out-of-bounds accesses (scanArr[0] write, scanArr[n-1] read).
 */
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();
    if (n <= 0) {  // nothing to compact; avoids UB in the steps below
        timer().endCpuTimer();
        return 0;
    }
    int* boolArr = new int[n];
    int* scanArr = new int[n];

    // 1) map: 1 for survivors, 0 for elements to discard
    for (int i = 0; i < n; ++i) {
        boolArr[i] = (idata[i] != 0) ? 1 : 0;
    }

    // 2) exclusive scan of the flags -> destination index of each survivor
    scanArr[0] = 0;
    for (int i = 1; i < n; ++i) {
        scanArr[i] = boolArr[i - 1] + scanArr[i - 1];
    }

    // 3) scatter survivors to their scanned positions
    for (int i = 0; i < n; ++i) {
        if (boolArr[i] == 1) {
            odata[scanArr[i]] = idata[i];
        }
    }

    // total survivors = last exclusive-scan value + last flag
    int num = scanArr[n - 1] + boolArr[n - 1];

    delete[] boolArr;
    delete[] scanArr;
    timer().endCpuTimer();
    return num;
}
}
}
145 changes: 143 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include "common.h"
#include "efficient.h"

#define blockSize 128

namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,13 +14,94 @@ namespace StreamCompaction {
return timer;
}

// One up-sweep (parallel reduce) step of the work-efficient scan at depth d.
// Each of the `threadNeeded` threads owns one node pair spaced 2^(d+1) apart
// and folds the left child's partial sum into the right child, in place.
__global__ void kernUpSweep(int threadNeeded, int d, int* dev_idata) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= threadNeeded) {
        return;  // grid may overshoot; extra threads do nothing
    }
    int stride = 1 << (d + 1);  // distance between consecutive node pairs
    int half = 1 << d;          // offset of the left child within a pair
    int base = tid * stride;
    dev_idata[base + stride - 1] += dev_idata[base + half - 1];
}

// One down-sweep step of the work-efficient exclusive scan at depth d.
// Each thread swaps its left child with the parent value and accumulates
// the old left value into the right child, turning the reduce tree built
// by kernUpSweep into an exclusive prefix sum.
__global__ void kernDownSweep(int threadNeeded, int d, int* dev_idata){
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= threadNeeded) {
        return;
    }
    int stride = 1 << (d + 1);  // spacing of node pairs at this depth
    int half = 1 << d;          // offset of the left child
    int base = tid * stride;
    int left = base + half - 1;
    int right = base + stride - 1;
    int saved = dev_idata[left];        // remember the left partial sum
    dev_idata[left] = dev_idata[right]; // left child <- parent
    dev_idata[right] += saved;          // right child <- parent + old left
}

// Maps each input element to a 0/1 flag: 1 where dev_idata[index] is
// nonzero (the element survives compaction), 0 otherwise.
// Fixed: the original only wrote the 1s and silently relied on the caller
// zero-filling temp_Arr beforehand; writing both values makes the kernel
// self-contained (and remains correct for callers that do memset first).
__global__ void kernMapToBoolean(int n, int* temp_Arr, int* dev_idata) {
    int index = threadIdx.x + (blockIdx.x * blockDim.x);
    if (index < n) {
        temp_Arr[index] = (dev_idata[index] != 0) ? 1 : 0;
    }
}

// Scatter pass: dev_tempArr holds the exclusive scan of the boolean map.
// Element i survives exactly when the scan value increases at i+1
// (scan[i+1] > scan[i] implies flag[i] == 1), and its destination slot is
// scan[i]. Index n-1 is deliberately skipped here because scan[i+1] does
// not exist for it; the host side patches the final element separately.
__global__ void kernScatter(int n, int* dev_tempArr, int* dev_finalArr, int* dev_idata) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n - 1) {
        return;
    }
    int slot = dev_tempArr[idx];
    if (dev_tempArr[idx + 1] > slot) {  // scan increased => element is nonzero
        dev_finalArr[slot] = dev_idata[idx];
    }
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
// GPU work-efficient exclusive scan (Blelloch): pads the input up to a
// power-of-two length, runs an in-place up-sweep (reduce) then a
// down-sweep on the device, and copies the first n results back to odata.
// Only the device-side work is timed; allocation and host<->device copies
// of the input/output fall outside the timer.
void scan(int n, int *odata, const int *idata) {

//used to round the array sizes to the next power of two.
int nCeil = ilog2ceil(n);
int n2PowCeil = 1 << nCeil;

// single device buffer, scanned in place; padded region beyond n
int* dev_idata;
cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
checkCUDAError("cudaMalloc dev_idata failed!");

cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
checkCUDAError("cudaMemcpy idata to dev_idata failed!");
//timer start
timer().startGpuTimer();
// zero the padding so the extra leaves contribute nothing to the sums
if (n2PowCeil != n) {
cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
checkCUDAError("cudaMemset failed!");
}

//open n threads is enough
// TODO
//up-sweep
// depth-d pass needs 2^(nCeil-d-1) threads; grid size shrinks each level
int depth = ilog2ceil(n2PowCeil) - 1;
for (int d = 0; d <= depth; ++d) {
int threadNeeded = 1 << (nCeil - d - 1);
dim3 fullBlocksPerGrid((blockSize + threadNeeded - 1) / blockSize);
kernUpSweep << <fullBlocksPerGrid, blockSize>> > (threadNeeded, d, dev_idata);
}
//down-sweep
// set the root to the identity (0) before descending — this is what
// makes the result an *exclusive* scan
cudaMemset(&(dev_idata[n2PowCeil -1]), 0, sizeof(int));
for (int d = depth; d >= 0; --d) {
int threadNeeded = 1 << (nCeil - d - 1);
dim3 fullBlocksPerGrid((blockSize + threadNeeded - 1) / blockSize);
kernDownSweep << <fullBlocksPerGrid, blockSize >> > (threadNeeded, d, dev_idata);
}
timer().endGpuTimer();


// only the first n entries are meaningful; the padded tail is discarded
cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("memory dev_idata to odata failed!");

cudaFree(dev_idata);
}

/**
Expand All @@ -31,10 +114,68 @@ namespace StreamCompaction {
* @returns The number of elements remaining after compaction.
*/
/**
 * GPU work-efficient stream compaction: map to boolean flags, exclusive
 * scan of the flags (up-sweep + down-sweep in place), then scatter the
 * nonzero elements to their scanned positions.
 *
 * @returns the number of elements remaining after compaction.
 *
 * Fixed two defects: (1) a leftover `return -1;` after endGpuTimer() made
 * the copy-back and the real `return numOfResults;` unreachable, so odata
 * was never filled; (2) dev_idata, dev_tempArr and dev_finalArr were never
 * freed, leaking device memory on every call.
 */
int compact(int n, int *odata, const int *idata) {
    // Round n up to the next power of two so the balanced-tree sweeps
    // always operate on a full tree.
    int nCeil = ilog2ceil(n);
    int n2PowCeil = 1 << nCeil;
    int* dev_idata;     // device copy of the input
    int* dev_tempArr;   // boolean flags, exclusive-scanned in place
    int* dev_finalArr;  // compacted output, sized after the up-sweep

    cudaMalloc((void**)&dev_idata, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMalloc dev_idata failed!");

    cudaMalloc((void**)&dev_tempArr, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMalloc dev_tempArr failed!");

    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy from idata to dev_idata failed!");

    //start
    timer().startGpuTimer();
    // zero the padding so the extra leaves contribute nothing to the sums
    if (n2PowCeil != n) {
        cudaMemset(&(dev_idata[n]), 0, (n2PowCeil - n) * sizeof(int));
        checkCUDAError("cudaMemset dev_idata failed!");
    }

    cudaMemset(dev_tempArr, 0, n2PowCeil * sizeof(int));
    checkCUDAError("cudaMemset dev_tempArr failed!");

    dim3 fullBlocksPerGrid((blockSize + n - 1) / blockSize);

    // 1) map input to 0/1 survivor flags
    kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (n, dev_tempArr, dev_idata);
    // kernScatter skips index n-1 (it needs scan[i+1]); remember the last
    // input element so it can be patched into the output afterwards.
    int lastElement = idata[n - 1];

    // 2) exclusive scan of the flags: up-sweep builds the reduce tree
    int depth = ilog2ceil(n2PowCeil) - 1;
    for (int d = 0; d <= depth; ++d) {
        int threadNeeded = 1 << (nCeil - d - 1);
        dim3 sweepBlocks((blockSize + threadNeeded - 1) / blockSize);
        kernUpSweep << <sweepBlocks, blockSize >> > (threadNeeded, d, dev_tempArr);
    }
    // After the up-sweep, the root holds the total survivor count; size the
    // output buffer from it (the synchronous memcpy also fences the sweeps).
    int numOfResults;
    cudaMemcpy(&numOfResults, &(dev_tempArr[n2PowCeil - 1]), sizeof(int), cudaMemcpyDeviceToHost);
    cudaMalloc((void**)&dev_finalArr, numOfResults * sizeof(int));
    checkCUDAError("cudaMalloc dev_finalArr failed!");
    // down-sweep: root <- identity, then descend to finish the exclusive scan
    cudaMemset(&(dev_tempArr[n2PowCeil - 1]), 0, sizeof(int));
    for (int d = depth; d >= 0; --d) {
        int threadNeeded = 1 << (nCeil - d - 1);
        dim3 sweepBlocks((blockSize + threadNeeded - 1) / blockSize);
        kernDownSweep << <sweepBlocks, blockSize >> > (threadNeeded, d, dev_tempArr);
    }
    // 3) scatter survivors (all indices except n-1) to their slots
    kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_tempArr, dev_finalArr, dev_idata);

    timer().endGpuTimer();
    //end

    cudaMemcpy(odata, dev_finalArr, numOfResults * sizeof(int), cudaMemcpyDeviceToHost);
    if (lastElement) {
        // A nonzero final element was skipped by kernScatter; its slot is
        // always the last one, numOfResults - 1.
        odata[numOfResults - 1] = lastElement;
    }

    // release device buffers (previously leaked)
    cudaFree(dev_idata);
    cudaFree(dev_tempArr);
    cudaFree(dev_finalArr);

    return numOfResults;
}
}
}
Loading