diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e539a2..9208670 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
 elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
     set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")
 elseif(WIN32)
-    if(${MSVC_VERSION} MATCHES "1900")
+    if(${MSVC_VERSION} MATCHES "1915")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2015")
+    elseif(${MSVC_VERSION} MATCHES "1900")
         set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2015")
     elseif(${MSVC_VERSION} MATCHES "1800")
         set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2013")
diff --git a/README.md b/README.md
index d63a6a1..e925e2a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,56 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xiao Zhang
+  * [LinkedIn](https://www.linkedin.com/in/xiao-zhang-674bb8148/)
+* Tested on: Windows 10, i7-7700K @ 4.20GHz 16.0GB, GTX 1080 15.96GB (my own PC)
-### (TODO: Your README)
+### Screenshot
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+The screenshot below uses the uniform grid simulation with coherent storage of position and velocity. 20,000 boids are simulated, the CUDA block size is set to 128, and vertical synchronization is turned on.
+
+![](images/grid_coherent_20000_128_vsync.gif)
+
+### Analysis
+
+#### The chart below shows the framerate of all three simulation methods under different block size and boid count configurations.
+
+![](images/chart1.JPG)
+
+#### The graph below shows the framerate of the naive simulation method under different block size and boid count configurations.
+
+![](images/graph1.JPG)
+
+#### The graph below shows the framerate of the uniform grid simulation method with separate memory storage for position and velocity, under different block size and boid count configurations.
+
+![](images/graph2.JPG)
+
+#### The graph below shows the framerate of the uniform grid simulation method with coherent memory storage for position and velocity, under different block size and boid count configurations.
+
+![](images/graph3.JPG)
+
+#### The chart below shows the framerate of all three simulation methods under different boid counts, with visualization turned on and the CUDA block size set to 128.
+
+![](images/chart2.JPG)
+
+#### The graph below plots the data from the chart above.
+
+![](images/graph4.JPG)
+
+### Q&A
+
+#### 1. For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+Increasing the number of boids lowers the framerate for every implementation. This is mainly because more data has to be transferred between the GPU and CPU each frame. According to the analysis, the framerate does not increase with a larger block size, which means parallelization is not the problem: every boid already runs on its own thread. The next most likely bottleneck is therefore data throughput.
+
+#### 2. For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+Changing the block count and block size hardly affects performance. The way the workload is assigned guarantees that every boid always runs on its own thread: when the block size is small, the block count grows so that all boids still fit in a single grid launch, as the sketch below shows.
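+
+A minimal host-side sketch of that launch-size arithmetic (it reuses the project's `numObjects` and `blockSize` names, but is a standalone illustration rather than code taken from `kernel.cu`):
+
+```cpp
+#include <cstdio>
+
+int main() {
+  const int numObjects = 20000;  // boid count used for the screenshot above
+  for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
+    // Same rounding-up expression the step functions use before every kernel launch.
+    int fullBlocksPerGrid = (numObjects + blockSize - 1) / blockSize;
+    std::printf("blockSize %4d -> %4d blocks (%6d threads)\n",
+                blockSize, fullBlocksPerGrid, fullBlocksPerGrid * blockSize);
+  }
+  return 0;
+}
+```
+
+Halving the block size roughly doubles the block count, so the total number of launched threads never drops below the boid count; the excess threads simply return early at the `if (index >= N)` guard inside each kernel.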
+
+#### 3. For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+Yes, there is a performance improvement, and yes, it is the outcome I expected. Instead of repeatedly gathering position and velocity through the scattered index indirection, one extra kernel reshuffles them into cell order so that they can be read coherently afterwards. The reshuffle itself still performs a scattered gather, but it only has to happen once per step. After reshuffling, neighboring boids sit next to each other in memory, so the later reads are served much more efficiently by the GPU cache thanks to the principle of locality.
+
+#### 4. Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+Not significantly. Checking 27 cells does add work: there are more cells, and therefore potentially more boids, to examine, and that loop runs serially inside each thread. On the other hand, it also makes the workload of neighboring threads more uniform (the extreme case, every boid checking every cell, is essentially the naive method). On the GPU this matters because a warp always waits for its slowest thread, so a more balanced per-thread workload largely offsets the cost of the extra cells. The sketch below shows how the number of cells to check follows from the cell width.
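+
+A small, self-contained sketch of that cell-width trade-off (`maxRuleDistance` here is an illustrative stand-in for the largest of the three rule distances, not a constant copied from `kernel.cu`):
+
+```cpp
+#include <cstdio>
+
+int main() {
+  const float maxRuleDistance = 5.0f;  // illustrative value only
+  const float cellWidths[] = { 2.0f * maxRuleDistance, maxRuleDistance };
+  for (float cellWidth : cellWidths) {
+    // A boid's search sphere has diameter 2 * maxRuleDistance. In the worst case it
+    // straddles (2 * maxRuleDistance / cellWidth) cell boundaries per axis, so it can
+    // touch one more cell than that along each axis.
+    int cellsPerAxis = (int)(2.0f * maxRuleDistance / cellWidth) + 1;
+    std::printf("cell width %.1f -> %d x %d x %d = %d cells to check\n",
+                cellWidth, cellsPerAxis, cellsPerAxis, cellsPerAxis,
+                cellsPerAxis * cellsPerAxis * cellsPerAxis);
+  }
+  return 0;
+}
+```
+
+With double-width cells a 2x2x2 block of cells suffices; with cells as wide as the search radius the full 3x3x3 block is needed, but each of those cells covers a smaller volume and therefore holds fewer candidate boids.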
diff --git a/images/chart1.JPG b/images/chart1.JPG
new file mode 100644
index 0000000..90e0fe7
Binary files /dev/null and b/images/chart1.JPG differ
diff --git a/images/chart2.JPG b/images/chart2.JPG
new file mode 100644
index 0000000..51987a4
Binary files /dev/null and b/images/chart2.JPG differ
diff --git a/images/graph1.JPG b/images/graph1.JPG
new file mode 100644
index 0000000..335a80b
Binary files /dev/null and b/images/graph1.JPG differ
diff --git a/images/graph2.JPG b/images/graph2.JPG
new file mode 100644
index 0000000..30a6c71
Binary files /dev/null and b/images/graph2.JPG differ
diff --git a/images/graph3.JPG b/images/graph3.JPG
new file mode 100644
index 0000000..79f680f
Binary files /dev/null and b/images/graph3.JPG differ
diff --git a/images/graph4.JPG b/images/graph4.JPG
new file mode 100644
index 0000000..ccd9964
Binary files /dev/null and b/images/graph4.JPG differ
diff --git a/images/grid_coherent_20000_128_vsync.gif b/images/grid_coherent_20000_128_vsync.gif
new file mode 100644
index 0000000..5945ad5
Binary files /dev/null and b/images/grid_coherent_20000_128_vsync.gif differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..030ca66 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_60
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..5bfd3e5 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -6,6 +6,13 @@
 #include "utilityCore.hpp"
 #include "kernel.h"
 
+#define MY_WAY
+#ifdef MY_WAY
+#define MY_NEIGHBOR_SIZE 3
+#else
+#define YOUR_NEIGHBOR_SIZE 2
+#endif
+
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
 #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) )
@@ -85,6 +92,7 @@ int *dev_gridCellEndIndices; // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_posEX;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -133,6 +141,15 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo
   }
 }
 
+__global__ void kernInitVec3Array(int N, glm::vec3 * arr, glm::vec3 value) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    arr[index] = value;
+  }
+}
+
+__global__ void kernResetIntBuffer(int N, int *intBuffer, int value);//declaration
+
 /**
 * Initialize memory, update some globals
 */
@@ -152,8 +169,8 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
 
   // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
+  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects, dev_vel1, scene_scale);//this seems like the key to everything
+  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects, dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
   // LOOK-2.1 computing grid params
@@ -163,12 +180,34 @@ void Boids::initSimulation(int N) {
   gridCellCount = gridSideCount * gridSideCount * gridSideCount;
   gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
+  float halfGridWidth = gridCellWidth * halfSideCount;// = scene_scale + gridCellWidth ?
   gridMinimum.x -= halfGridWidth;
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+  kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(N, dev_particleArrayIndices, -1);
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+  kernResetIntBuffer<<<fullBlocksPerGrid, blockSize>>>(N, dev_particleGridIndices, -1);
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+  kernResetIntBuffer<<<(gridCellCount + blockSize - 1) / blockSize, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  cudaMalloc((void**)&dev_posEX, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_posEX failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
 
@@ -233,7 +272,50 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 cohesionVelocity(0, 0, 0);
+  glm::vec3 separationVelocity(0, 0, 0);
+  glm::vec3 alignmentVelocity(0, 0, 0);
+
+  int cohesionCount = 0;
+  int alignmentCount = 0;
+
+  //rule 1 : cohesion
+  glm::vec3 center(0, 0, 0);
+  for (int i = 0; i < N; i++)
+  {
+    if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule1Distance)
+    {
+      center += pos[i];
+      cohesionCount++;
+    }
+  }
+  if(cohesionCount!=0) center /= cohesionCount;
+  cohesionVelocity = (center - 
pos[iSelf]) * rule1Scale; + + //rule 2 : separation + for (int i = 0; i < N; i++) + { + if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule2Distance) + { + separationVelocity += pos[iSelf] - pos[i]; + } + } + separationVelocity *= rule2Scale; + + //rule 3 : alignment + for (int i = 0; i < N; i++) + { + if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule3Distance) + { + alignmentVelocity += vel[i]; + alignmentCount++; + } + } + if(alignmentCount!=0) alignmentVelocity /= alignmentCount; + alignmentVelocity *= rule3Scale; + + return cohesionVelocity + separationVelocity + alignmentVelocity; } /** @@ -245,6 +327,16 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::vec3 temp = vel1[index] + computeVelocityChange(N, index, pos, vel1); + if (glm::length(temp) > maxSpeed) temp = glm::normalize(temp) * maxSpeed; + vel2[index] = temp; + } /** @@ -289,6 +381,15 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::ivec3 gridIndex3D = (glm::ivec3)((pos[index] - gridMin) * inverseCellWidth); + gridIndices[index] = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution); + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +407,18 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + if ((index - 1 < 0) || (index - 1 >=0 && particleGridIndices[index] != particleGridIndices[index - 1])) + gridCellStartIndices[particleGridIndices[index]] = index; + + if ((index + 1 >= N) || (index + 1 < N && particleGridIndices[index] != particleGridIndices[index + 1])) + gridCellEndIndices[particleGridIndices[index]] = index; + } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +435,107 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::ivec3 gridIndex3D = (glm::ivec3)((pos[index] - gridMin) * inverseCellWidth); + + //which 8 ? I am using 26. 
+ int search_start = 0; + int search_end = 0; + +#ifdef MY_WAY + search_start = -(MY_NEIGHBOR_SIZE - 1) / 2; + search_end = (MY_NEIGHBOR_SIZE - 1) / 2; +#else + search_start = 0; + search_end = YOUR_NEIGHBOR_SIZE - 1; +#endif + + + glm::vec3 center(0, 0, 0); + glm::vec3 cohesionVelocity(0, 0, 0); + glm::vec3 separationVelocity(0, 0, 0); + glm::vec3 alignmentVelocity(0, 0, 0); + + int cohesionCount = 0; + int alignmentCount = 0; + + for (int i = search_start; i <= search_end; i++) + { + for (int j = search_start; j <= search_end; j++) + { + for (int k = search_start; k <= search_end; k++) + { + glm::ivec3 gridIndex3Dtemp = gridIndex3D + glm::ivec3(i, j, k); + if (gridIndex3Dtemp.x < 0) gridIndex3Dtemp.x = gridResolution + gridIndex3Dtemp.x % gridResolution; + if (gridIndex3Dtemp.x >= gridResolution) gridIndex3Dtemp.x = gridIndex3Dtemp.x % gridResolution; + if (gridIndex3Dtemp.y < 0) gridIndex3Dtemp.y = gridResolution + gridIndex3Dtemp.y % gridResolution; + if (gridIndex3Dtemp.y >= gridResolution) gridIndex3Dtemp.y = gridIndex3Dtemp.y % gridResolution; + if (gridIndex3Dtemp.z < 0) gridIndex3Dtemp.z = gridResolution + gridIndex3Dtemp.z % gridResolution; + if (gridIndex3Dtemp.z >= gridResolution) gridIndex3Dtemp.z = gridIndex3Dtemp.z % gridResolution; + + int gridIndex1Dtemp = gridIndex3Dto1D(gridIndex3Dtemp.x, gridIndex3Dtemp.y, gridIndex3Dtemp.z, gridResolution); + + int start = gridCellStartIndices[gridIndex1Dtemp]; + int end = gridCellEndIndices[gridIndex1Dtemp]; + + for (int a = start; a <= end; a++) + { + int b = particleArrayIndices[a]; + //rule 1 : cohesion + if (b != index && glm::length(pos[b] - pos[index]) < rule1Distance) + { + center += pos[b]; + cohesionCount++; + } + + //rule 2 : separation + if (b != index && glm::length(pos[b] - pos[index]) < rule2Distance) + { + separationVelocity += pos[index] - pos[b]; + } + + //rule 3 : alignment + if (b != index && glm::length(pos[b] - pos[index]) < rule3Distance) + { + alignmentVelocity += vel1[b]; + alignmentCount++; + } + } + } + } + } + + + if(cohesionCount!=0) center /= cohesionCount; + cohesionVelocity = (center - pos[index]) * rule1Scale; + + separationVelocity *= rule2Scale; + + if(alignmentCount!=0) alignmentVelocity /= alignmentCount; + alignmentVelocity *= rule3Scale; + + glm::vec3 temp = vel1[index] + cohesionVelocity + separationVelocity + alignmentVelocity; + if (glm::length(temp) > maxSpeed) temp = glm::normalize(temp) * maxSpeed; + vel2[index] = temp; +} + +__global__ void kernReshufflePosVel( + int N, int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *posEX, + glm::vec3 *vel, glm::vec3 *velEX) { + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + posEX[index] = pos[particleArrayIndices[index]]; + velEX[index] = vel[particleArrayIndices[index]]; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +555,92 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::ivec3 gridIndex3D = (glm::ivec3)((pos[index] - gridMin) * inverseCellWidth); + + //which 8 ? I am using 26. 
+  int search_start = 0;
+  int search_end = 0;
+
+#ifdef MY_WAY
+  search_start = -(MY_NEIGHBOR_SIZE - 1) / 2;
+  search_end = (MY_NEIGHBOR_SIZE - 1) / 2;
+#else
+  search_start = 0;
+  search_end = YOUR_NEIGHBOR_SIZE - 1;
+#endif
+
+
+  glm::vec3 center(0, 0, 0);
+  glm::vec3 cohesionVelocity(0, 0, 0);
+  glm::vec3 separationVelocity(0, 0, 0);
+  glm::vec3 alignmentVelocity(0, 0, 0);
+
+  int cohesionCount = 0;
+  int alignmentCount = 0;
+
+  for (int i = search_start; i <= search_end; i++)
+  {
+    for (int j = search_start; j <= search_end; j++)
+    {
+      for (int k = search_start; k <= search_end; k++)
+      {
+        glm::ivec3 gridIndex3Dtemp = gridIndex3D + glm::ivec3(i, j, k);
+        if (gridIndex3Dtemp.x < 0) gridIndex3Dtemp.x = gridResolution + gridIndex3Dtemp.x % gridResolution;
+        if (gridIndex3Dtemp.x >= gridResolution) gridIndex3Dtemp.x = gridIndex3Dtemp.x % gridResolution;
+        if (gridIndex3Dtemp.y < 0) gridIndex3Dtemp.y = gridResolution + gridIndex3Dtemp.y % gridResolution;
+        if (gridIndex3Dtemp.y >= gridResolution) gridIndex3Dtemp.y = gridIndex3Dtemp.y % gridResolution;
+        if (gridIndex3Dtemp.z < 0) gridIndex3Dtemp.z = gridResolution + gridIndex3Dtemp.z % gridResolution;
+        if (gridIndex3Dtemp.z >= gridResolution) gridIndex3Dtemp.z = gridIndex3Dtemp.z % gridResolution;
+
+        int gridIndex1Dtemp = gridIndex3Dto1D(gridIndex3Dtemp.x, gridIndex3Dtemp.y, gridIndex3Dtemp.z, gridResolution);
+
+        int start = gridCellStartIndices[gridIndex1Dtemp];
+        int end = gridCellEndIndices[gridIndex1Dtemp];
+
+        for (int a = start; a <= end; a++)
+        {
+          //rule 1 : cohesion
+          if (a != index && glm::length(pos[a] - pos[index]) < rule1Distance)
+          {
+            center += pos[a];
+            cohesionCount++;
+          }
+
+          //rule 2 : separation
+          if (a != index && glm::length(pos[a] - pos[index]) < rule2Distance)
+          {
+            separationVelocity += pos[index] - pos[a];
+          }
+
+          //rule 3 : alignment
+          if (a != index && glm::length(pos[a] - pos[index]) < rule3Distance)
+          {
+            alignmentVelocity += vel1[a];
+            alignmentCount++;
+          }
+        }
+      }
+    }
+  }
+
+
+  if (cohesionCount != 0) center /= cohesionCount;
+  cohesionVelocity = (center - pos[index]) * rule1Scale;
+
+  separationVelocity *= rule2Scale;
+
+  if (alignmentCount != 0) alignmentVelocity /= alignmentCount;
+  alignmentVelocity *= rule3Scale;
+
+  glm::vec3 temp = vel1[index] + cohesionVelocity + separationVelocity + alignmentVelocity;
+  if (glm::length(temp) > maxSpeed) temp = glm::normalize(temp) * maxSpeed;
+  vel2[index] = temp;
 }
 
 /**
@@ -349,6 +649,15 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
@@ -356,7 +665,7 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // Uniform Grid Neighbor search using Thrust sort.
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
+  //   Use 2x width grids. (what does this mean? it only takes in an inverse width, 2 * gridInverseCellWidth ?)
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
 // - Naively unroll the loop for finding the start and end indices of each
@@ -364,6 +673,29 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  //label
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  //sort
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  //unroll
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  //search & update vel1
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  //kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+
+  //update pos
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  //exchange vel1 & vel2
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
   // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
   // In Parallel:
@@ -382,6 +714,45 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+  glm::vec3 *temp = nullptr;//will be used multiple times, so initialize here for clarity
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  //label
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  //sort
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  //unroll
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  //reshuffle, posEX is reshuffled pos, vel2 is reshuffled vel1
+  kernReshufflePosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_posEX, dev_vel1, dev_vel2);
+
+  //exchange pos & posEX, just for clarity and consistency, always read position from pos
+  temp = dev_pos;
+  dev_pos = dev_posEX;
+  dev_posEX = temp;
+
+  //exchange vel1 & vel2, just for clarity and consistency, always read velocity from vel1
+  temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
+
+  //search & update vel1
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+  //kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+
+  //update pos
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  //exchange vel1 & vel2
+  temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +761,12 @@ void Boids::endSimulation() {
   cudaFree(dev_vel1);
   cudaFree(dev_vel2);
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+ cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_gridCellStartIndices); + + cudaFree(dev_posEX); } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..1548b47 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 20000; const float DT = 0.2f; /** @@ -216,6 +216,12 @@ void initShaders(GLuint * program) { double fps = 0; double timebase = 0; int frame = 0; + ///////////////////////////// + const double start_time = 15; + const double end_time = 30; + ///////////////////////////// + double fpsTotal = 0; + int fpsCount = 0; Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure // your CUDA development setup is ready to go. @@ -226,12 +232,21 @@ void initShaders(GLuint * program) { frame++; double time = glfwGetTime(); + if (time - timebase > 1.0) { fps = frame / (time - timebase); + ///////////////////////////////////////// + if (time > start_time && time < end_time) + { + fpsTotal += fps; + fpsCount++; + } + //////////////////////////////////////// timebase = time; frame = 0; } + runCUDA(); std::ostringstream ss; @@ -239,6 +254,12 @@ void initShaders(GLuint * program) { ss.precision(1); ss << std::fixed << fps; ss << " fps] " << deviceName; + ///////////////////////////// + if (time > end_time) + { + ss << " [average from " << start_time << "s to " << end_time << "s = " << (fpsTotal / fpsCount) << "fps]"; + } + ///////////////////////////// glfwSetWindowTitle(window, ss.str().c_str()); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);