diff --git a/README.md b/README.md
index d63a6a1..3bf34ca 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,73 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Wanru Zhao, 59981278
+  * [LinkedIn](www.linkedin.com/in/wanru-zhao)
+* Tested on: Windows 10, Intel(R) Xeon(R) CPU E5-1630 v4 @ 3.70GHz, GTX 1070 (SIG Lab)
 
-### (TODO: Your README)
+### Screenshots
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Screenshot of flocking boids
+
+![](images/screenshot.jpg)
+
+GIF
+
+![](images/flocking_cut.gif)
+
+### Performance Analysis
+
+#### Framerate change with increasing number of boids
+
+Number of Boids | Naive | Uniform Grid | Coherent Grid
+:---|:---:|:---:|:---:
+5000 | 547.471 | 932.684 | 943.434
+10000 | 230.529 | 1546.9 | 1521.98
+15000 | 120.268 | 1549.28 | 1498.19
+20000 | 57.9549 | 1347.32 | 1398.56
+50000 | 19.7976 | 912.897 | 988.624
+100000 | 2.7 | 510.311 | 593.427
+150000 | Crash | 313.939 | 391.472
+
+![](images/fps_boidnum.jpg)
+
+#### Framerate change with increasing block size, 5000 boids
+
+Block Size | Naive | Uniform Grid | Coherent Grid
+:---|:---:|:---:|:---:
+32 | 549.209 | 910.022 | 912.538
+64 | 548.936 | 923.373 | 942.928
+128 | 547.471 | 932.684 | 943.434
+256 | 541.605 | 961.5 | 956.118
+512 | 529.656 | 891.667 | 939.112
+
+![](images/fps_blocksize.JPG)
+
+#### Framerate change with 8 cells vs. 27 cells, block size = 128, Coherent Grid
+
+Number of Boids | 8 Cells | 27 Cells
+:---|:---:|:---:
+5000 | 943.434 | 925.261
+10000 | 1521.98 | 1579.39
+15000 | 1498.19 | 1529.99
+20000 | 1398.56 | 1438.4
+50000 | 988.624 | 981.994
+
+![](images/fps_8v27.JPG)
+
+### Questions
+
+* For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+For the naive method, average FPS drops steadily as the number of boids increases, because every boid is checked against every other boid, so the work grows quadratically. For the uniform and coherent grids, FPS first rises and then falls: at low boid counts the overhead of building, sorting, and indexing the grid dominates, while at high boid counts that overhead becomes negligible and the grid restricts each boid to checking only nearby neighbors.
+
+* For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+Changing the block size does not change the average FPS much for any method. For the naive method performance drops slightly as block size grows; for the grid methods it rises slightly and then drops. The GPU I used has 15 SMs and a warp size of 32; since every tested block size is a multiple of the warp size, warps stay fully populated, and I think the small differences come from how well each block size maps onto the warps and SMs.
+
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+Yes, and it is the outcome I expected. At 50000 boids and above, the coherent grid outperforms the scattered grid, because the extra lookup through the shuffled boid indices is skipped and neighbor positions and velocities are read from contiguous memory.
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+Yes, slightly. Over a range of boid counts the 27-cell search is marginally faster than the 8-cell search. In that range, the cost of deciding which octant of 8 cells to examine for each boid slightly exceeds the cost of simply iterating over all 27 surrounding cells (see the sketch below).
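+
+To make that comparison concrete, here is a small, purely illustrative host-side sketch (hypothetical helper functions `cells8` and `cells27`, not the actual kernel code; the fractional cell coordinate `gridIdx3D` follows the naming used in the kernels later in this diff). The 8-cell search first has to decide which octant of cells the boid leans toward, while the 27-cell search simply walks every adjacent cell:
+
+```cpp
+// Illustrative sketch only: which grid cells does one boid examine?
+#include <glm/glm.hpp>
+#include <vector>
+
+// 8-cell: the boid's own cell plus the 7 cells of the octant it leans toward.
+std::vector<glm::ivec3> cells8(glm::vec3 gridIdx3D) {
+  int ox = (gridIdx3D.x - (int)gridIdx3D.x) > 0.5f ? 1 : -1;
+  int oy = (gridIdx3D.y - (int)gridIdx3D.y) > 0.5f ? 1 : -1;
+  int oz = (gridIdx3D.z - (int)gridIdx3D.z) > 0.5f ? 1 : -1;
+  std::vector<glm::ivec3> cells;
+  for (int dx : {0, ox})
+    for (int dy : {0, oy})
+      for (int dz : {0, oz})
+        cells.push_back(glm::ivec3(gridIdx3D) + glm::ivec3(dx, dy, dz));
+  return cells;  // always 8 cells, chosen with three extra compares per boid
+}
+
+// 27-cell: no octant logic, every adjacent cell is visited unconditionally.
+std::vector<glm::ivec3> cells27(glm::vec3 gridIdx3D) {
+  std::vector<glm::ivec3> cells;
+  for (int dx = -1; dx <= 1; dx++)
+    for (int dy = -1; dy <= 1; dy++)
+      for (int dz = -1; dz <= 1; dz++)
+        cells.push_back(glm::ivec3(gridIdx3D) + glm::ivec3(dx, dy, dz));
+  return cells;  // always 27 cells, but the per-boid selection work is gone
+}
+```
+
+Which strategy wins therefore depends on whether the saved selection work outweighs the extra cells that have to be scanned, which is what the table above measures.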
diff --git a/images/flocking.gif b/images/flocking.gif
new file mode 100644
index 0000000..f6e5ed2
Binary files /dev/null and b/images/flocking.gif differ
diff --git a/images/flocking_cut.gif b/images/flocking_cut.gif
new file mode 100644
index 0000000..28c9a02
Binary files /dev/null and b/images/flocking_cut.gif differ
diff --git a/images/fps_8v27.JPG b/images/fps_8v27.JPG
new file mode 100644
index 0000000..7841556
Binary files /dev/null and b/images/fps_8v27.JPG differ
diff --git a/images/fps_blocksize.JPG b/images/fps_blocksize.JPG
new file mode 100644
index 0000000..fa490ec
Binary files /dev/null and b/images/fps_blocksize.JPG differ
diff --git a/images/fps_boidnum.jpg b/images/fps_boidnum.jpg
new file mode 100644
index 0000000..cd83f52
Binary files /dev/null and b/images/fps_boidnum.jpg differ
diff --git a/images/screenshot.jpg b/images/screenshot.jpg
new file mode 100644
index 0000000..59ac416
Binary files /dev/null and b/images/screenshot.jpg differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..b737097 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_61
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..07fea99 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices;   // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_coherentPos;
+glm::vec3 *dev_coherentVel1;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +171,27 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**) &dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**) &dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**) &dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**) &dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
+  cudaMalloc((void**) &dev_coherentPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!");
+
+  cudaMalloc((void**) &dev_coherentVel1, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentVel1 failed!");
+
   cudaDeviceSynchronize();
 }
 
@@ -230,10 +253,45 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+  // Rule 2: boids try to stay a distance d away from each other
+  // Rule 3: boids try to match the speed of surrounding boids
+  glm::vec3 perceived_center(0.0f, 0.0f, 0.0f);
+  glm::vec3 c(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f);
+
+  int count1 = 0, count2 = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i != iSelf) {
+      float distance = glm::distance(pos[iSelf], pos[i]);
+      if (distance < rule1Distance) {
+        perceived_center += pos[i];
+        count1++;
+      }
+      if (distance < rule2Distance) {
+        c -= (pos[i] - pos[iSelf]);
+      }
+      if (distance < rule3Distance) {
+        perceived_velocity += vel[i];
+        count2++;
+      }
+    }
+  }
+
+  if (count1 > 0) {
+    perceived_center /= (float)count1;
+  }
+  else {
+    perceived_center = pos[iSelf];
+  }
+
+  if (count2 > 0) {
+    perceived_velocity /= (float)count2;
+  }
+
+  return rule1Scale * (perceived_center - pos[iSelf]) + rule2Scale * c + rule3Scale * perceived_velocity;
 }
 
 /**
@@ -245,6 +303,15 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index < N) {
+
+    glm::vec3 new_velocity = computeVelocityChange(N, index, pos, vel1) + vel1[index];
+    if(glm::length(new_velocity) > maxSpeed) {
+      new_velocity *= maxSpeed / glm::length(new_velocity);
+    }
+    vel2[index] = new_velocity;
+  }
 }
 
 /**
@@ -255,8 +322,8 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
   // Update position by velocity
   int index = threadIdx.x + (blockIdx.x * blockDim.x);
   if (index >= N) {
-    return;
-  }
+    return;
+  }
   glm::vec3 thisPos = pos[index];
   thisPos += vel[index] * dt;
 
@@ -289,6 +356,14 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 index3D = (pos[index] - gridMin) * inverseCellWidth;
+  gridIndices[index] = gridIndex3Dto1D((int)index3D.x, (int)index3D.y, (int)index3D.z, gridResolution);
+  indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +381,26 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  int gridIdx = particleGridIndices[index];
+
+  if (index == 0) {
+    gridCellStartIndices[gridIdx] = 0;
+    return;
+  }
+  if (index == N - 1) {
+    gridCellEndIndices[gridIdx] = N - 1;
+  }
+  int prevGridIdx = particleGridIndices[index - 1];
+  if (prevGridIdx != gridIdx) {
+    gridCellStartIndices[gridIdx] = index;
+    gridCellEndIndices[prevGridIdx] = index - 1;
+  }
 }
 
@@ -322,8 +417,94 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   int *particleArrayIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
   // - Identify which cells may contain neighbors. This isn't always 8.
   // - For each cell, read the start/end indices in the boid pointer array.
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if(index < N) {
+
+    int particleIdx = particleArrayIndices[index];
+    glm::vec3 gridIdx3D = pos[particleIdx];
+    gridIdx3D -= gridMin;
+    gridIdx3D = gridIdx3D * inverseCellWidth;
+
+    int neighborCells[8];
+    int neighborIdxX = (gridIdx3D.x - (int)gridIdx3D.x) > 0.5 ? 1 : -1;
+    int neighborIdxY = (gridIdx3D.y - (int)gridIdx3D.y) > 0.5 ? 1 : -1;
+    int neighborIdxZ = (gridIdx3D.z - (int)gridIdx3D.z) > 0.5 ? 1 : -1;
+
+    glm::ivec3 offset[8] = {
+      glm::ivec3(0, 0, 0),
+      glm::ivec3(neighborIdxX, 0, 0),
+      glm::ivec3(0, neighborIdxY, 0),
+      glm::ivec3(0, 0, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, 0),
+      glm::ivec3(neighborIdxX, 0, neighborIdxZ),
+      glm::ivec3(0, neighborIdxY, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, neighborIdxZ)
+    };
+
+    for(int i = 0; i < 8; i++) {
+      glm::ivec3 idx = glm::ivec3(gridIdx3D) + offset[i];
+      if(idx.x < 0 || idx.x >= gridResolution || idx.y < 0 || idx.y >= gridResolution || idx.z < 0 || idx.z >= gridResolution) {
+        neighborCells[i] = -1;
+      } else {
+        neighborCells[i] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
+      }
+    }
+
+    glm::vec3 perceived_center(0.0f);
+    glm::vec3 c(0.0f);
+    glm::vec3 perceived_velocity(0.0f);
+    int count1 = 0, count2 = 0;
+
+    for(int i = 0; i < 8; i++) {
+
+      int gridIdx = neighborCells[i];
+      if(gridIdx == -1) {
+        continue;
+      }
+
+      int startIdx = gridCellStartIndices[gridIdx];
+      int endIdx = gridCellEndIndices[gridIdx];
+      if(startIdx == -1) {
+        continue;  // cell contains no boids
+      }
+
+      for(int j = startIdx; j <= endIdx; j++) {
+        int boid = particleArrayIndices[j];
+        if(boid != particleIdx) {
+          float distance = glm::distance(pos[boid], pos[particleIdx]);
+          if(distance < rule1Distance) {
+            perceived_center += pos[boid];
+            count1++;
+          }
+          if(distance < rule2Distance) {
+            c -= (pos[boid] - pos[particleIdx]);
+          }
+          if(distance < rule3Distance) {
+            perceived_velocity += vel1[boid];
+            count2++;
+          }
+        }
+      }
+    }
+
+    if(count1 > 0) {
+      perceived_center /= count1;
+    } else {
+      perceived_center = pos[particleIdx];
+    }
+
+    if(count2 > 0) {
+      perceived_velocity /= count2;
+    }
+
+    glm::vec3 new_vel = vel1[particleIdx] + rule1Scale * (perceived_center - pos[particleIdx]) + rule2Scale * c + rule3Scale * perceived_velocity;
+    if(glm::length(new_vel) > maxSpeed) {
+      new_vel *= maxSpeed / glm::length(new_vel);
+    }
+    vel2[particleIdx] = new_vel;
+  }
+}
+
@@ -341,6 +522,125 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
   int *gridCellStartIndices, int *gridCellEndIndices,
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
   // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
   // except with one less level of indirection.
   // This should expect gridCellStartIndices and gridCellEndIndices to refer
   // directly to pos and vel1.
   // - Identify the grid cell that this particle is in
   // - Identify which cells may contain neighbors. This isn't always 8.
   // - For each cell, read the start/end indices in the boid pointer array.
   //   DIFFERENCE: For best results, consider what order the cells should be
   //   checked in to maximize the memory benefits of reordering the boids data.
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if(index < N) {
+
+    glm::vec3 gridIdx3D = (pos[index] - gridMin) * inverseCellWidth;
+
+    int neighborCells[8];
+    int neighborIdxX = (gridIdx3D.x - (int)gridIdx3D.x) > 0.5 ? 1 : -1;
+    int neighborIdxY = (gridIdx3D.y - (int)gridIdx3D.y) > 0.5 ? 1 : -1;
+    int neighborIdxZ = (gridIdx3D.z - (int)gridIdx3D.z) > 0.5 ? 1 : -1;
+
+    glm::ivec3 offset[8] = {
+      glm::ivec3(0, 0, 0),
+      glm::ivec3(neighborIdxX, 0, 0),
+      glm::ivec3(0, neighborIdxY, 0),
+      glm::ivec3(0, 0, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, 0),
+      glm::ivec3(neighborIdxX, 0, neighborIdxZ),
+      glm::ivec3(0, neighborIdxY, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, neighborIdxZ)
+    };
+
+    for(int i = 0; i < 8; i++) {
+      glm::ivec3 idx = glm::ivec3(gridIdx3D) + offset[i];
+      if(idx.x < 0 || idx.x >= gridResolution || idx.y < 0 || idx.y >= gridResolution || idx.z < 0 || idx.z >= gridResolution) {
+        neighborCells[i] = -1;
+      } else {
+        neighborCells[i] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
+      }
+    }
+
+    /// nearest 27 cells
+    /*
+    int neighborCells[27];
+
+    for (int i = -1; i <= 1; i++) {
+      for (int j = -1; j <= 1; j++) {
+        for (int k = -1; k <= 1; k++) {
+          glm::ivec3 idx = glm::ivec3(gridIdx3D) + glm::ivec3(i, j, k);
+          int cellidx = (i + 1) + 3 * (j + 1) + 9 * (k + 1);
+          if (idx.x < 0 || idx.x >= gridResolution || idx.y < 0 || idx.y >= gridResolution || idx.z < 0 || idx.z >= gridResolution) {
+            neighborCells[cellidx] = -1;
+          }
+          else {
+            neighborCells[cellidx] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
+          }
+        }
+      }
+    }
+    */
+
+    glm::vec3 perceived_center(0.0f);
+    glm::vec3 c(0.0f);
+    glm::vec3 perceived_velocity(0.0f);
+    int count1 = 0, count2 = 0;
+
+    for(int i = 0; i < 8; i++) {
+
+      int gridIdx = neighborCells[i];
+      if(gridIdx == -1) {
+        continue;
+      }
+
+      int startIdx = gridCellStartIndices[gridIdx];
+      int endIdx = gridCellEndIndices[gridIdx];
+      if(startIdx == -1) {
+        continue;  // cell contains no boids
+      }
+
+      for(int j = startIdx; j <= endIdx; j++) {
+        if(j != index) {
+          float distance = glm::distance(pos[j], pos[index]);
+          if(distance < rule1Distance) {
+            perceived_center += pos[j];
+            count1++;
+          }
+          if(distance < rule2Distance) {
+            c -= (pos[j] - pos[index]);
+          }
+          if(distance < rule3Distance) {
+            perceived_velocity += vel1[j];
+            count2++;
+          }
+        }
+      }
+    }
+
+    if(count1 > 0) {
+      perceived_center /= count1;
+    } else {
+      perceived_center = pos[index];
+    }
+
+    if(count2 > 0) {
+      perceived_velocity /= count2;
+    }
+
+    glm::vec3 new_vel = vel1[index] + rule1Scale * (perceived_center - pos[index]) + rule2Scale * c + rule3Scale * perceived_velocity;
+    if(glm::length(new_vel) > maxSpeed) {
+      new_vel *= maxSpeed / glm::length(new_vel);
+    }
+    vel2[index] = new_vel;
+  }
+}
+
+__global__ void kernReshufflePosAndVel(
+  int N, int *particleArrayIndices, glm::vec3 *pos,
+  glm::vec3 *vel1, glm::vec3 *shuffle_pos, glm::vec3 *shuffle_vel1) {
+
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if(index < N) {
+    int idx = particleArrayIndices[index];
+    shuffle_pos[index] = pos[idx];
+    shuffle_vel1[index] = vel1[idx];
+  }
+}
 
 /**
@@ -349,6 +649,15 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("Naive kernUpdatePos failed!");
+
+  std::swap(dev_vel1, dev_vel2);
+
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +673,31 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("ScatterGrid kernComputeIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("ScatterGrid kernResetIntBuffer failed!");
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("ScatterGrid kernResetIntBuffer failed!");
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("ScatterGrid kernIdentifyStartEnd failed!");
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("ScatterGrid kernUpdateVelNeighborSearchScattered failed!");
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("ScatterGrid kernUpdatePos failed!");
+
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +716,33 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("CoherentGrid kernComputeIndices failed!");
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("CoherentGrid kernResetIntBuffer failed!");
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("CoherentGrid kernResetIntBuffer failed!");
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("CoherentGrid kernIdentifyStartEnd failed!");
+
+  kernReshufflePosAndVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_coherentPos, dev_coherentVel1);
+  checkCUDAErrorWithLine("CoherentGrid kernReshufflePosAndVel failed!");
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_coherentPos, dev_coherentVel1, dev_vel2);
+  checkCUDAErrorWithLine("CoherentGrid kernUpdateVelNeighborSearchCoherent failed!");
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_coherentPos, dev_vel2);
+  checkCUDAErrorWithLine("CoherentGrid kernUpdatePos failed!");
+
+  std::swap(dev_pos, dev_coherentPos);
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::endSimulation() {
@@ -390,6 +751,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+
+  cudaFree(dev_coherentPos);
+  cudaFree(dev_coherentVel1);
+
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..4504b21 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 50000;
 const float DT = 0.2f;
 
 /**
@@ -217,6 +217,10 @@ void initShaders(GLuint * program) {
   double timebase = 0;
   int frame = 0;
 
+  double fpsSum = 0.0;
+  double cudaTime = 0.0;
+
+
   Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
                      // your CUDA development setup is ready to go.
 
@@ -224,12 +228,15 @@
     glfwPollEvents();
 
     frame++;
+    fpsSum++;
+
     double time = glfwGetTime();
-
+    cudaTime = time;
     if (time - timebase > 1.0) {
-      fps = frame / (time - timebase);
-      timebase = time;
-      frame = 0;
+      cudaTime += (time - timebase);
+      fps = frame / (time - timebase);
+      timebase = time;
+      frame = 0;
     }
 
     runCUDA();
@@ -258,6 +265,9 @@
   }
   glfwDestroyWindow(window);
   glfwTerminate();
+
+  std::cout << "running time: " << cudaTime << ", average fps: " << fpsSum / cudaTime << std::endl;
+
 }