Project 2: Eric Chiu #27

Open · wants to merge 5 commits into master
90 changes: 84 additions & 6 deletions README.md
@@ -3,12 +3,90 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* Eric Chiu
* Tested on: Windows 10 Education, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.60GHz, 32GB RAM, NVIDIA GeForce GTX 1070 (SIGLAB)

## Description

This project implements and compares CPU and GPU versions of the scan (prefix sum) and stream compaction algorithms. The GPU implementations include a naive scan, a work-efficient scan, and a version built on Thrust.
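
To make the two operations concrete, here is a small worked example (a hypothetical standalone program, not taken from the project's sources): an exclusive scan replaces each element with the sum of everything before it, and compaction keeps the non-zero elements by scattering them to the indices produced by scanning their 0/1 keep flags.

```
#include <cstdio>

int main() {
    const int n = 6;
    int in[n]   = { 3, 0, 5, 0, 2, 4 };
    int keep[n], scan[n], out[n];

    // Map to 0/1 flags: keep the non-zero elements.
    for (int i = 0; i < n; i++) keep[i] = (in[i] != 0) ? 1 : 0;

    // Exclusive prefix sum of the flags gives each kept element's output index.
    scan[0] = 0;
    for (int i = 1; i < n; i++) scan[i] = scan[i - 1] + keep[i - 1];

    // Scatter: every kept element lands at its scanned index.
    int count = 0;
    for (int i = 0; i < n; i++) {
        if (keep[i]) { out[scan[i]] = in[i]; count = scan[i] + 1; }
    }

    printf("compacted (%d elements):", count);        // compacted (4 elements): 3 5 2 4
    for (int i = 0; i < count; i++) printf(" %d", out[i]);
    printf("\n");
    return 0;
}
```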

## Performance Analysis


The images below compare the CPU, naive, work-efficient, and Thrust implementations of the scan algorithm.


![](./img/scan-data.png)

![](./img/scan-chart.png)


The images below compare the CPU (without scan), CPU (with scan), and work-efficient GPU implementations of the compaction algorithm.


![](./img/compaction-data.png)

![](./img/compaction-chart.png)


Overall, the Thrust implementation performed best at large array sizes. For small arrays, however, the CPU implementation was fastest, followed by the naive, then the work-efficient, and finally the Thrust implementation. As the array size grew, the implementations ran into different bottlenecks: the CPU implementation is limited by the sheer number of operations, since it processes memory serially rather than in parallel, while the naive and work-efficient GPU implementations appear to be limited by global memory access.
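
To make the global-memory point concrete, the sketch below shows the shape of a single naive (Hillis-Steele) scan pass. It illustrates the general technique rather than the exact kernels in this PR, and the kernel and buffer names are assumptions. Each of the roughly log2(n) passes reads and writes the whole array in global memory, so total global traffic grows as O(n log n), whereas the serial CPU loop touches each element only once.

```
__global__ void kernNaiveScanPass(int n, int offset, int *odata, const int *idata) {
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= n) return;

    if (index >= offset) {
        // Two global reads and one global write per element, per pass.
        odata[index] = idata[index - offset] + idata[index];
    } else {
        // Elements before the offset are copied through unchanged.
        odata[index] = idata[index];
    }
}

// Host side, the passes double the offset and ping-pong the two buffers
// (dev_in and dev_out are assumed names):
//
//   for (int offset = 1; offset < n; offset *= 2) {
//       kernNaiveScanPass<<<blocks, blockSize>>>(n, offset, dev_out, dev_in);
//       std::swap(dev_in, dev_out);
//   }
```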


## Program Output

```
****************
** SCAN TESTS **
****************
[ 26 38 4 11 28 35 3 0 3 4 36 3 39 ... 22 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.0006ms (std::chrono Measured)
[ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6071 6093 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.0007ms (std::chrono Measured)
[ 0 26 64 68 79 107 142 145 145 148 152 188 191 ... 6031 6046 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.023552ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.023552ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.095232ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.105472ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 0.058368ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.05632ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 3 0 0 3 2 1 1 1 0 0 2 1 1 ... 2 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.0009ms (std::chrono Measured)
[ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.0009ms (std::chrono Measured)
[ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 2 2 ]
passed
==== cpu compact with scan ====
elapsed time: 0.0035ms (std::chrono Measured)
[ 3 3 2 1 1 1 2 1 1 3 2 1 2 ... 1 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.365568ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.359424ms (CUDA Measured)
passed
Press any key to continue . . .

```
Binary file added img/compaction-chart.png
Binary file added img/compaction-data.png
Binary file added img/scan-chart.png
Binary file added img/scan-data.png
2 changes: 1 addition & 1 deletion src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 256; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
    ${SOURCE_FILES}
-   OPTIONS -arch=sm_20
+   OPTIONS -arch=sm_61   # compute capability 6.1 (Pascal, e.g. GTX 1070)
    )
17 changes: 13 additions & 4 deletions stream_compaction/common.cu
@@ -22,17 +22,26 @@ namespace StreamCompaction {
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata)
{
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= n) return;

    // Keep (1) non-zero elements, remove (0) zeros.
    bools[index] = idata[index] == 0 ? 0 : 1;
}

/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices)
{
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (index >= n) return;

    // Write each kept element to the output index computed by the scan.
    if (bools[index] == 1)
    {
        odata[indices[index]] = idata[index];
    }
}

}
76 changes: 66 additions & 10 deletions stream_compaction/cpu.cu
@@ -17,9 +17,25 @@ namespace StreamCompaction {
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before moving to the GPU, you can simulate your GPU scan in this function first.
*/
// Exclusive prefix sum: odata[i] = idata[0] + ... + idata[i - 1], with odata[0] = 0.
void scanHelper(int n, int *odata, const int *idata)
{
    if (n == 0) return;

    odata[0] = 0;
    for (int i = 1; i < n; i++)
    {
        odata[i] = odata[i - 1] + idata[i - 1];
    }
}

void scan(int n, int *odata, const int *idata)
{
    timer().startCpuTimer();

    scanHelper(n, odata, idata);

    timer().endCpuTimer();
}

@@ -29,22 +45,62 @@
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Copy each non-zero element forward into the next free output slot.
    int oIndex = 0;
    for (int i = 0; i < n; i++)
    {
        if (idata[i] != 0)
        {
            odata[oIndex] = idata[i];
            oIndex++;
        }
    }

    timer().endCpuTimer();
    return oIndex;
}

/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata)
{
    timer().startCpuTimer();

    int* bdata = new int[n];
    int* sdata = new int[n];

    // Map each element to a 0/1 keep flag.
    for (int i = 0; i < n; i++)
    {
        if (idata[i] != 0)
        {
            bdata[i] = 1;
        }
        else
        {
            bdata[i] = 0;
        }
    }

    // Exclusive scan of the flags gives each kept element's output index.
    // (Cannot call scan() here because it would restart the CPU timer.)
    scanHelper(n, sdata, bdata);

    // Scatter the kept elements to their scanned indices.
    int count = 0;
    for (int i = 0; i < n; i++)
    {
        if (bdata[i] != 0)
        {
            odata[sdata[i]] = idata[i];
            count = sdata[i] + 1;
        }
    }

    timer().endCpuTimer();

    delete[] bdata;
    delete[] sdata;

    return count;
}
}
}