diff --git a/README.md b/README.md index 0e38ddb..614e061 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,41 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Zirui Zang + * [LinkedIn](https://www.linkedin.com/in/zirui-zang/) +* Tested on: Windows 10, AMD Ryzen 7 3700X @ 3.60GHz 32GB, RTX2070 SUPER 8GB (Personal) +* I'm using one late day for this project. -### (TODO: Your README) +### Results -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +In this lab I have implemented the scan (prefix-sum) and stream compaction algorithm in three different ways: +1. CPU sequential method +2. GPU naive method, which parallelizes the addition but has to iterate log(n) times. +3. GPU efficient method, which parallelizes the whole process. + +Additional attention has been given to utilizing shared memory in kernel computation to reduce global memory access. The array that needs to be operated on is pre-fetched into shared memory so that scan operations can run within shared memory. + +The code is also implemented so that a large input array can be split into GPU block-sized chunks and merged after computation. + +Here are the charts of my implementations: +![chart_scan](img/chart_scan.png) + +![chart_stream](img/chart_stream.png) + +Here are the results of my implementations: +![scan](img/scan.png) + +![string](img/string.png) + +Here are some issues with the code: +1. The efficient method is not efficient at all. + +### Discussions + +The efficient method is not so efficient mainly due to complex kernel operations and excessive memory operations. In the up-sweep and down-sweep more than half of the threads are idle. 
This creates a lot of CUDA core occupation and very scattered memory access. If the numbers in the array can be more congregated, we can launch fewer threads and save more time in sequential memory access. + + +### VS Studio useless machine +![VS studio](img/vss.gif) +This is probably due to the autosync of my cloud. diff --git a/img/chart_scan.png b/img/chart_scan.png new file mode 100644 index 0000000..d4e1e88 Binary files /dev/null and b/img/chart_scan.png differ diff --git a/img/chart_stream.png b/img/chart_stream.png new file mode 100644 index 0000000..393eb86 Binary files /dev/null and b/img/chart_stream.png differ diff --git a/img/scan.png b/img/scan.png new file mode 100644 index 0000000..671c0c7 Binary files /dev/null and b/img/scan.png differ diff --git a/img/string.png b/img/string.png new file mode 100644 index 0000000..79371f4 Binary files /dev/null and b/img/string.png differ diff --git a/img/vss.gif b/img/vss.gif new file mode 100644 index 0000000..d720b0b Binary files /dev/null and b/img/vss.gif differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..678961a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 18; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -26,8 +26,10 @@ int main(int argc, char* argv[]) { printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); + printf("%d ", SIZE); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + //onesArray(SIZE - 1, a); a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -51,48 +53,48 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - 
//printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + /////* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + ////onesArray(SIZE, c); + ////printDesc("1s array for finding bugs"); + ////StreamCompaction::Naive::scan(SIZE, c, a); + ////printArray(SIZE, c, true); */ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); 
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +139,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..386af2b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,16 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int tx = threadIdx.x; + if (tx >= n) { + return; + } + if (idata[tx] > 0) { + bools[tx] = 1; + } + else { + bools[tx] = 0; + } } /** @@ -32,8 +41,13 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int tx = threadIdx.x; + if (tx >= n) { + return; + } + if (bools[tx] == 1) { + odata[indices[tx]] = idata[tx]; + } } - } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..898919a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -18,9 +18,37 @@ namespace StreamCompaction { * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. */ void scan(int n, int *odata, const int *idata) { + + //for (int ind = 0; ind < n / 2; ind++) { + // printf("%d ", idata[ind]); + //} + //printf("\n"); + //printf("\n"); + //for (int ind = n / 2; ind < n; ind++) { + // printf("%d ", idata[ind]); + //} + //printf("\n"); + //printf("\n"); + + timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int ind = 1; ind < n; ind++) { + odata[ind] = idata[ind-1] + odata[ind - 1]; + } + timer().endCpuTimer(); + + //for (int ind = 0; ind < n / 2; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + //for (int ind = n / 2; ind < n; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); } /** @@ -30,9 +58,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int count = 0; + for (int ind = 0; ind < n; ind++) { + if (idata[ind] != 0) { + odata[count] = idata[ind]; + count++; + } + } timer().endCpuTimer(); - return -1; + return count; + + } /** @@ -42,9 +78,36 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { 
timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + + int* bool_list = new int[n]; + int* scan_list = new int[n]; + for (int ind = 0; ind < n; ind++) { + if (idata[ind] == 0) { + bool_list[ind] = 0; + } + else { + bool_list[ind] = 1; + } + //printf("%d ", bool_list[ind]); + } + //printf("\n"); + scan_list[0] = 0; + //printf("%d ", scan_list[0]); + for (int ind = 1; ind < n; ind++) { + scan_list[ind] = bool_list[ind - 1] + scan_list[ind - 1]; + //printf("%d ", scan_list[ind]); + } + //printf("\n"); + int count = 0; + for (int ind = 0; ind < n; ind++) { + //printf("%d ", idata[ind]); + if (bool_list[ind] == 1) { + odata[scan_list[ind]] = idata[ind]; + count++; + } + } + + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..a11f7db 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 1024 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,227 @@ namespace StreamCompaction { return timer; } + int* dev_array; + int* dev_array_static; + + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + + + //__global__ void kernReduction_1st_attempt( + // int array_length, int sum_ind_diff, int start_ind, int stride, + // int* array) { + // // compute one layer of scan in parallel. 
+ // int index = threadIdx.x + (blockIdx.x * blockDim.x); + // if (index * stride + sum_ind_diff + start_ind >= array_length) { + // return; + // } + // array[index * stride + sum_ind_diff + start_ind] = array[index * stride + start_ind] + array[index * stride + sum_ind_diff + start_ind]; + // __syncthreads(); + //} + + //__global__ void kernScanFromReduction_1st_attempt( + // int array_length, int sum_ind_diff, int start_ind, int stride, + // int* array) { + // int index = threadIdx.x + (blockIdx.x * blockDim.x); + // if (array_length-1 - index * stride - sum_ind_diff - start_ind < 0) { + // return; + // } + // int left_child = array[array_length - 1 - index * stride - sum_ind_diff]; + // array[array_length - 1 - index * stride - sum_ind_diff] = array[array_length - 1 - index * stride]; + // array[array_length - 1 - index * stride] = array[array_length - 1 - index * stride] + left_child; + + //} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + //void scan_1st_attempt(int n, int* odata, const int* idata) { + // // deal with non-2-power input + // int depth = ilog2ceil(n); + // int array_length = pow(2, depth); + // if (ilog2(n) != depth) { + // int* new_idata = new int[array_length]; + // memset(new_idata, 0, array_length * sizeof(int)); + // memcpy(new_idata, idata, n * sizeof(int)); + // idata = new_idata; + // } + // cudaMalloc((void**)&dev_array, array_length * sizeof(int)); + // cudaMemcpy(dev_array, idata, array_length * sizeof(int), cudaMemcpyHostToDevice); + + // timer().startGpuTimer(); + // dim3 fullBlocksPerGrid((array_length + blockSize - 1) / blockSize); + // for (int depth_ind = 0; depth_ind <= depth - 1; depth_ind++) { + // int sum_ind_diff = pow(2, depth_ind); + // int start_ind = sum_ind_diff - 1; + // int stride = pow(2, depth_ind + 1); + // kernReduction_1st_attempt << > > (array_length, 
sum_ind_diff, start_ind, stride, dev_array); + // } + // cudaDeviceSynchronize(); + + // cudaMemset(dev_array + array_length - 1, 0, sizeof(int)); + // for (int depth_ind = depth - 1; depth_ind >=0 ; depth_ind--) { + // int sum_ind_diff = pow(2, depth_ind); + // int start_ind = sum_ind_diff - 1; + // int stride = pow(2, depth_ind + 1); + // kernScanFromReduction_1st_attempt << > > (array_length, sum_ind_diff, start_ind, stride, dev_array); + // } + // timer().endGpuTimer(); + // cudaMemcpy(odata, dev_array, array_length * sizeof(int), cudaMemcpyDeviceToHost); + + // //for (int ind = 0; ind < array_length; ind++) { + // // printf("%d ", odata[ind]); + // //} + // //printf("\n"); + // //printf("\n"); + //} + + __global__ void kernReduction( + //int array_length, int start_ind, int* array, int* array_static) { + int array_length, int* array, int* array_static) { + // compute scan in parallel. + __shared__ int share_array[blockSize]; + int tx = threadIdx.x; + if (tx >= array_length) { + return; + } + //share_array[tx] = array_static[start_ind + tx]; + share_array[tx] = array_static[tx + (blockIdx.x * blockDim.x)]; + __syncthreads(); + for (int stride = 1; stride < blockDim.x; stride *= 2) { + if (tx % (2 * stride) == (2 * stride) - 1) { + share_array[tx] += share_array[tx - stride]; + } + __syncthreads(); + } + array[tx + (blockIdx.x * blockDim.x)] = share_array[tx]; + } + + + __global__ void kernScanFromReduction( + int array_length, int depth, int* array, int* array_static) { + __shared__ int share_array[blockSize]; + int tx = threadIdx.x; + if (tx >= array_length) { + return; + } + if (tx == blockSize - 1) { + share_array[tx] = 0; + } + else { + share_array[tx] = array_static[tx + (blockIdx.x * blockDim.x)]; + } + __syncthreads(); + for (int depth_ind = depth - 1; depth_ind >= 0; depth_ind--) { + int stride = pow(2, depth_ind); + if (tx % (2 * stride) == (2 * stride) - 1) { + int left_child = share_array[tx - stride]; + share_array[tx - stride] = share_array[tx]; + 
share_array[tx] += left_child; + } + __syncthreads(); + } + // convert result to inclusive + if (tx != blockSize - 1) { + array[tx + (blockIdx.x * blockDim.x)] = share_array[tx + 1]; + } + __syncthreads(); + } + + __global__ void kernAdd( + int array_length, int block_ind, int* array_static, int* array) { + int tx = threadIdx.x + ((blockIdx.x + block_ind) * blockDim.x); + __shared__ int value; + value = array_static[block_ind * blockDim.x - 1]; + if (tx >= array_length) { + return; + } + array[tx] += value; + } + + void scan(int n, int* odata, const int* idata, bool timer_on) { + int depth = ilog2ceil(n); + int array_length = pow(2, depth); + if (ilog2(n) != depth) { + int* new_idata = new int[array_length]; + memset(new_idata, 0, array_length * sizeof(int)); + memcpy(new_idata, idata, n * sizeof(int)); + idata = new_idata; + } + cudaMalloc((void**)&dev_array, array_length * sizeof(int)); + cudaMemcpy(dev_array, idata, array_length * sizeof(int), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_array_static, array_length * sizeof(int)); + cudaMemcpy(dev_array_static, dev_array, array_length * sizeof(int), cudaMemcpyHostToDevice); + dim3 fullBlocksPerGrid((array_length + blockSize - 1) / blockSize); + //dim3 fullBlocksPerGrid(1); + + int num_block; + if (array_length < blockSize) { + num_block = 1; + } + else { + num_block = array_length / blockSize; + } + + if (timer_on) { + timer().startGpuTimer(); + } + + kernReduction << > > (array_length, dev_array, dev_array_static); + cudaMemcpy(dev_array_static, dev_array, array_length * sizeof(int), cudaMemcpyHostToDevice); + + kernScanFromReduction << > > (array_length, depth, dev_array, dev_array_static); + cudaMemcpy(dev_array_static, dev_array, array_length * sizeof(int), cudaMemcpyHostToDevice); +; + for (int block_ind = 1; block_ind < num_block; block_ind++) { + kernAdd << > > (array_length, block_ind, dev_array_static, dev_array); + cudaDeviceSynchronize(); + } + if (timer_on) { + timer().endGpuTimer(); + } + 
cudaMemcpy(odata + 1, dev_array, (array_length - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + //printf("\n"); + //printf("\n"); + + //for (int ind = 0; ind < array_length / 2; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + //for (int ind = array_length / 2; ind < array_length; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + //for (int ind = 0; ind < array_length/4; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + //for (int ind = array_length / 4; ind < array_length/2; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + //for (int ind = array_length / 2; ind < array_length / 4 * 3; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + //for (int ind = array_length / 4 * 3; ind < array_length; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); } /** @@ -30,11 +246,57 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + int depth = ilog2ceil(n); + int array_length = pow(2, depth); + if (ilog2(n) != depth) { + int* new_idata = new int[array_length]; + memset(new_idata, 0, array_length * sizeof(int)); + memcpy(new_idata, idata, n * sizeof(int)); + idata = new_idata; + } + cudaMalloc((void**)&dev_bools, array_length * sizeof(int)); + cudaMalloc((void**)&dev_indices, array_length * sizeof(int)); + cudaMalloc((void**)&dev_idata, array_length * sizeof(int)); + cudaMalloc((void**)&dev_odata, array_length * sizeof(int)); + cudaMemcpy(dev_idata, idata, array_length * sizeof(int), cudaMemcpyHostToDevice); + dim3 fullBlocksPerGrid((array_length + blockSize - 1) / blockSize); + int count = 0; + int* host_bools = (int*) malloc(array_length * sizeof(int)); + int num_block; + if (array_length < blockSize) { + num_block = 1; + } + else { + num_block = array_length / blockSize; + } + timer().startGpuTimer(); - // TODO + for (int block_ind = 0; block_ind < num_block; block_ind++) { + int start_ind = block_ind * blockSize; + Common::kernMapToBoolean << > > (array_length, dev_bools + start_ind, dev_idata + start_ind); + } + cudaDeviceSynchronize(); + + cudaMemcpy(host_bools, dev_bools, array_length * sizeof(int), cudaMemcpyDeviceToHost); + Efficient::scan(array_length, odata, host_bools, false); + //for (int ind = 0; ind < array_length; ind++) { + // printf("%d ", host_bools[ind]); + //} + //printf("\n"); + //printf("\n"); + cudaMemcpy(dev_indices, odata, array_length * sizeof(int), cudaMemcpyHostToDevice); + + for (int block_ind = 0; block_ind < num_block; block_ind++) { + int start_ind = block_ind * blockSize; + Common::kernScatter << > > (array_length, dev_odata, dev_idata + start_ind, dev_bools + start_ind, dev_indices + start_ind); + } + cudaDeviceSynchronize(); timer().endGpuTimer(); - return -1; + cudaMemcpy(&count, dev_indices + array_length - 1, 1 * sizeof(int), 
cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost); + + return count; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5c75c13 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, bool timer_on = true); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..2dabdbf 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,6 +3,8 @@ #include "common.h" #include "naive.h" +#define blockSize 1024 + namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +13,79 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + int* dev_array; + int* dev_array2; + + __global__ void kernScanLayer( + int array_length, int stride, int* array, int* array2) { + // compute one layer of scan in parallel. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= array_length - stride) { + return; + } + array[index + stride] += array2[index]; + __syncthreads(); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + // deal with non-2-power input + int depth = ilog2ceil(n); + int array_length = pow(2, depth); + if (ilog2(n) != depth) { + int* new_idata = new int[array_length]; + memset(new_idata, 0, array_length * sizeof(int)); + memcpy(new_idata, idata, n * sizeof(int)); + idata = new_idata; + } + + int num_block; + if (array_length < blockSize) { + num_block = 1; + } + else { + num_block = array_length / blockSize; + } + + dim3 fullBlocksPerGrid((array_length + blockSize - 1) / blockSize); + cudaMalloc((void**)&dev_array, array_length * sizeof(int)); + cudaMalloc((void**)&dev_array2, array_length * sizeof(int)); + cudaMemcpy(dev_array + 1, idata, (array_length - 1) * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(dev_array, 0, sizeof(int)); + cudaMemcpy(dev_array2, dev_array, array_length * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + for (int depth_ind = 1; depth_ind <= depth; depth_ind++) { + int stride = pow(2, depth_ind - 1); + kernScanLayer << > > (array_length, stride, dev_array, dev_array2); + cudaMemcpy(dev_array2, dev_array, array_length * sizeof(int), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + } + + timer().endGpuTimer(); + cudaMemcpy(odata, dev_array, n * sizeof(int), cudaMemcpyDeviceToHost); + + //int* array_0 = new int[array_length]; + //int* array_1 = new int[array_length]; + //cudaMemcpy(array_0, dev_array_dep1, array_length * sizeof(int), cudaMemcpyDeviceToHost); + //cudaMemcpy(array_1, dev_array_dep2, array_length * sizeof(int), cudaMemcpyDeviceToHost); + //printf("\n"); + //printf("\n"); + //for (int ind = 0; ind < array_length; ind++) { + // printf("%d ", array_0[ind]); + //} + //printf("\n"); + //printf("\n"); + //for (int ind = 0; ind < n; ind++) { + // printf("%d ", odata[ind]); + //} + //printf("\n"); + //printf("\n"); + + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..f2a574d 100644 --- 
a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,29 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int depth = ilog2ceil(n); + int array_length = pow(2, depth); + if (ilog2(n) != depth) { + int* new_idata = new int[array_length]; + memset(new_idata, 0, array_length * sizeof(int)); + memcpy(new_idata, idata, n * sizeof(int)); + idata = new_idata; + } + thrust::host_vector host_idata(idata, idata + array_length); + thrust::host_vector host_odata(array_length); + thrust::device_vector dev_idata(array_length); + thrust::device_vector dev_odata(array_length); + dev_idata = host_idata; + timer().startGpuTimer(); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + host_odata = dev_odata; + thrust::copy(host_odata.begin(), host_odata.end(), odata); + } } }