diff --git a/README.md b/README.md
index d63a6a1..5a2d07a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,42 @@
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**

-* (TODO) YOUR NAME HERE
+Dear Grader!
+
+As you may not know, I registered late for the class (on the last day of registration) and arranged adjusted due dates with Professor Shehzan over email; my Project 1 due date is Sept 22. However, as you can see, I was about 26 hours late. I know there are 4 late days without penalty, but I only want to use ONE on this project. I didn't mean to spend a late day on just 2 extra hours, so please feel free to deduct points for being one day (hopefully only 2 hours!) late.
+
+Thanks
+
+* Xiao Wei
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tested on: Windows 10, i9-9900k @ 3.6GHz 16.0GB, RTX 2080 SUPER 16GB

### (TODO: Your README)

-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Simulation gifs
+
+![p1-6](https://user-images.githubusercontent.com/66859615/134622482-b29722aa-0751-4b32-9732-9a65009cc15f.gif)
+
+![p1-5](https://user-images.githubusercontent.com/66859615/134622494-2c6ffde0-e230-4ce1-b757-a03d36d04777.gif)
+
+![20210924054902](https://user-images.githubusercontent.com/66859615/134626140-86cab152-370f-4cfa-9b1c-79ab40f0faa6.jpg)
+
+![20210924055435](https://user-images.githubusercontent.com/66859615/134626143-63d35c54-4bd2-4be4-816c-46a207ff12b4.jpg)
+
+![20210924061425](https://user-images.githubusercontent.com/66859615/134627047-6a582d78-b49f-45cf-bb48-43236121419a.jpg)
+
+## For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+Performance drops as the number of boids increases. Each boid needs a thread and compute time to update its velocity and position, and each boid also has more candidate neighbors to test, so more boids means more work per frame.
+
+## For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+Changing the block size (and with it the block count) does not change performance drastically. The total number of threads stays the same and launching extra blocks is cheap, so as long as each block is large enough to keep the GPU occupied, the same work gets scheduled either way.
+
+## For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+Yes, the performance improved, and this is the outcome I expected: the coherent version no longer has to go through dev_particleArrayIndices for every neighbor, and the reshuffled pos/vel buffers make the neighbor reads contiguous in memory.
+
+## Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+Checking 27 cells was actually somewhat faster in my tests. I think this is because the 27-cell configuration uses a smaller cell width, so the total searched volume around each boid is tighter (27 * w^3 versus 8 * (2w)^3 = 64 * w^3) and fewer out-of-range boids have to be distance-tested, which outweighs the cost of looping over more cells.
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..904e858 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
#include 
#include "utilityCore.hpp"
#include "kernel.h"
+//#include "device_launch_parameters.h"

// LOOK-2.1 potentially useful for doing grid-based neighbor search
#ifndef imax
@@ -85,6 +86,9 @@ int *dev_gridCellEndIndices; // to this cell?

// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
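+// Buffers added for TODO-2.3: copies of pos/vel reordered into grid-cell order (filled by
+// kernShufflePosVel below) so that boids in the same cell are contiguous in memory for the
+// coherent-grid neighbor search.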
+glm::vec3* dev_posShuffle;
+glm::vec3* dev_velShuffle;
+
// LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation
@@ -169,6 +173,27 @@ void Boids::initSimulation(int N) {
  gridMinimum.z -= halfGridWidth;

  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_posShuffle, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_posShuffle failed!");
+
+  cudaMalloc((void**)&dev_velShuffle, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_velShuffle failed!");
+
+  cudaDeviceSynchronize();
}
@@ -229,11 +254,137 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* Compute the new velocity on the body with index `iSelf` due to the `N` boids
* in the `pos` and `vel` arrays.
*/
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3* pos, const glm::vec3* vel) {
+  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+  /*
+  function rule1(Boid boid)
+
+    Vector perceived_center
+
+    foreach Boid b:
+      if b != boid and distance(b, boid) < rule1Distance then
+        perceived_center += b.position
+      endif
+    end
+
+    perceived_center /= number_of_neighbors
+
+    return (perceived_center - boid.position) * rule1Scale
+  end
+  */
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule1Distance) {
+      perceivedCenter += pos[i];
+      numRule1Neighbor++;
+    }
+  }
+
+  // Only apply rule 1 when at least one neighbor was found; otherwise the zero
+  // "center" would pull every isolated boid toward the origin.
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+
+  // Rule 2: boids try to stay a distance d away from each other
+  /*
+  function rule2(Boid boid)
+
+    Vector c = 0
+
+    foreach Boid b
+      if b != boid and distance(b, boid) < rule2Distance then
+        c -= (b.position - boid.position)
+      endif
+    end
+
+    return c * rule2Scale
+  end
+  */
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule2Distance) {
+      separate -= (pos[i] - pos[iSelf]);
+    }
+  }
+
+  result += separate * rule2Scale;
+
  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  /*
+  function rule3(Boid boid)
+
+    Vector perceived_velocity
+
+    foreach Boid b
+      if b != boid and distance(b, boid) < rule3Distance then
+        perceived_velocity += b.velocity
+      endif
+    end
+
+    perceived_velocity /= number_of_neighbors
+
+    return perceived_velocity * rule3Scale
+  end
+  */
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule3Distance) {
+      perceivedVelocity += vel[i];
+      numRule3Neighbor++;
+    }
+  }
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+
+  result += perceivedVelocity * rule3Scale;
+
+  return result;
}

/**
@@ -245,6 +396,25 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
  // Compute a new velocity based on pos and vel1
  // Clamp the speed
  // Record the new velocity into vel2. Question: why NOT vel1?
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  glm::vec3 changedVelocity = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[index] = changedVelocity;
}

/**
@@ -289,6 +459,21 @@ __global__ void kernComputeIndices(int N, int gridResolution,
  // - Label each boid with the index of its grid cell.
  // - Set up a parallel array of integer indices as pointers to the actual
  //   boid data in pos and vel1/vel2
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int iX = std::floor((pos[index].x - gridMin.x) * inverseCellWidth);
+  int iY = std::floor((pos[index].y - gridMin.y) * inverseCellWidth);
+  int iZ = std::floor((pos[index].z - gridMin.z) * inverseCellWidth);
+
+  indices[index] = index;
+  gridIndices[index] = gridIndex3Dto1D(iX, iY, iZ, gridResolution);
}

// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +491,28 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
  // Identify the start point of each cell in the gridIndices array.
  // This is basically a parallel unrolling of a loop that goes
  // "this index doesn't match the one before it, must be a new cell!"
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int gridIndex = particleGridIndices[index];
+
+  if (index == 0) {
+    gridCellStartIndices[gridIndex] = 0;
+  }
+  else {
+    int gridIndexLeft = particleGridIndices[index - 1];
+
+    if (gridIndexLeft != gridIndex) {
+      gridCellStartIndices[gridIndex] = index;
+      gridCellEndIndices[gridIndexLeft] = index - 1;
+    }
+  }
+
+  // The last boid closes the final occupied cell; without this, that cell's
+  // end index would stay at -1 and its boids would never be visited.
+  if (index == N - 1) {
+    gridCellEndIndices[gridIndex] = N - 1;
+  }
}

__global__ void kernUpdateVelNeighborSearchScattered(
@@ -316,12 +523,107 @@ __global__ void kernUpdateVelNeighborSearchScattered(
  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
  // the number of boids that need to be checked.
+
+  int iSelf = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (iSelf >= N) {
+    return;
+  }
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
+  glm::vec3 posSelf = pos[iSelf];
+  glm::vec3 gridPos = (posSelf - gridMin) * inverseCellWidth;
+  glm::vec3 gridPosFloor = floor(gridPos);
+  glm::vec3 gridPosRound = round(gridPos);
+  int gridIndex = gridIndex3Dto1D(gridPosFloor.x, gridPosFloor.y, gridPosFloor.z, gridResolution);
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  for (int x = -1; x < 1; x++)
+  {
+    for (int y = -1; y < 1; y++)
+    {
+      for (int z = -1; z < 1; z++)
+      {
+        int neighborX = gridPosRound.x + x;
+        int neighborY = gridPosRound.y + y;
+        int neighborZ = gridPosRound.z + z;
+
+        if (neighborX < 0 || neighborX >= gridResolution || neighborY < 0 || neighborY >= gridResolution || neighborZ < 0 || neighborZ >= gridResolution)
+          continue;
+
+        // - For each cell, read the start/end indices in the boid pointer array.
+        int neighborGridIndex = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+
+        int startIndex = gridCellStartIndices[neighborGridIndex];
+        int endIndex = gridCellEndIndices[neighborGridIndex];
+
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int cellIndex = startIndex; cellIndex <= endIndex; cellIndex++) {
+          int i = particleArrayIndices[cellIndex];
+          if (i == iSelf) {
+            continue;
+          }
+
+          float distance = glm::distance(pos[i], pos[iSelf]);
+
+          if (distance < rule1Distance) {
+            perceivedCenter += pos[i];
+            numRule1Neighbor++;
+          }
+
+          if (distance < rule2Distance) {
+            separate -= (pos[i] - pos[iSelf]);
+          }
+
+          if (distance < rule3Distance) {
+            perceivedVelocity += vel1[i];
+            numRule3Neighbor++;
+          }
+        }
+      }
+    }
+  }
+
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+  result += separate * rule2Scale;
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+  result += perceivedVelocity * rule3Scale;
+
+  // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 changedVelocity = vel1[iSelf] + result;
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[iSelf] = changedVelocity;
}

__global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +643,106 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
  // - Access each boid in the cell and compute velocity change from
  //   the boids rules, if this boid is within the neighborhood distance.
  // - Clamp the speed change before putting the new speed in vel2
+
+  int iSelf = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (iSelf >= N) {
+    return;
+  }
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
+  // - Identify the grid cell that this particle is in
+  glm::vec3 posSelf = pos[iSelf];
+  glm::vec3 gridPos = (posSelf - gridMin) * inverseCellWidth;
+  glm::vec3 gridPosFloor = floor(gridPos);
+  glm::vec3 gridPosRound = round(gridPos);
+  int gridIndex = gridIndex3Dto1D(gridPosFloor.x, gridPosFloor.y, gridPosFloor.z, gridResolution);
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
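+  // round() with offsets in {-1, 0} selects the 2x2x2 block of cells nearest to this
+  // boid (its own cell plus the adjacent cell on whichever side it is closer to in each
+  // dimension), so at most 8 cells are scanned per boid.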
+
+  for (int x = -1; x < 1; x++)
+  {
+    for (int y = -1; y < 1; y++)
+    {
+      for (int z = -1; z < 1; z++)
+      {
+        int neighborX = gridPosRound.x + x;
+        int neighborY = gridPosRound.y + y;
+        int neighborZ = gridPosRound.z + z;
+
+        if (neighborX < 0 || neighborX >= gridResolution || neighborY < 0 || neighborY >= gridResolution || neighborZ < 0 || neighborZ >= gridResolution)
+          continue;
+
+        // - For each cell, read the start/end indices in the boid pointer array.
+        int neighborGridIndex = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+
+        int startIndex = gridCellStartIndices[neighborGridIndex];
+        int endIndex = gridCellEndIndices[neighborGridIndex];
+
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int i = startIndex; i <= endIndex; i++) {
+          if (i == iSelf) {
+            continue;
+          }
+
+          float distance = glm::distance(pos[i], pos[iSelf]);
+
+          if (distance < rule1Distance) {
+            perceivedCenter += pos[i];
+            numRule1Neighbor++;
+          }
+
+          if (distance < rule2Distance) {
+            separate -= (pos[i] - pos[iSelf]);
+          }
+
+          if (distance < rule3Distance) {
+            perceivedVelocity += vel1[i];
+            numRule3Neighbor++;
+          }
+        }
+      }
+    }
+  }
+
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+  result += separate * rule2Scale;
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+  result += perceivedVelocity * rule3Scale;
+
+  // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 changedVelocity = vel1[iSelf] + result;
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[iSelf] = changedVelocity;
}

/**
@@ -348,7 +750,12 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
*/
void Boids::stepSimulationNaive(float dt) {
  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
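+  // One step: the brute-force velocity update reads vel1 and writes the clamped result
+  // to vel2, positions are integrated from vel2, then the velocity buffers are swapped
+  // so the next step reads the values just written.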
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
  // TODO-1.2 ping-pong the velocity buffers
+  // Swap the pointers rather than assigning dev_vel1 = dev_vel2, which would make both
+  // names alias the same buffer and lose the other allocation.
+  glm::vec3* temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
}

void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,8 +771,48 @@ void Boids::stepSimulationScatteredGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 numGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3* temp;
+  temp = dev_vel2;
+  dev_vel2 = dev_vel1;
+  dev_vel1 = temp;
+}
+
+__global__ void kernShufflePosVel(int N, int* particleArrayIndices,
+  glm::vec3* pos, glm::vec3* vel,
+  glm::vec3* posShuffled, glm::vec3* velShuffled) {
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int particleArrayIndex = particleArrayIndices[index];
+
+  posShuffled[index] = pos[particleArrayIndex];
+  velShuffled[index] = vel[particleArrayIndex];
}
+
void Boids::stepSimulationCoherentGrid(float dt) {
  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
@@ -382,6 +829,38 @@ void Boids::stepSimulationCoherentGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
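+  // Same pipeline as the scattered grid, plus one extra kernShufflePosVel pass that
+  // reorders pos/vel1 into cell order before the neighbor search, so the coherent kernel
+  // indexes pos/vel directly instead of going through dev_particleArrayIndices.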
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 numGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernShufflePosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_posShuffle, dev_velShuffle);
+  cudaMemcpy(dev_pos, dev_posShuffle, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+  cudaMemcpy(dev_vel1, dev_velShuffle, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3* temp;
+  temp = dev_vel2;
+  dev_vel2 = dev_vel1;
+  dev_vel1 = temp;
}

void Boids::endSimulation() {
@@ -390,6 +869,14 @@ void Boids::endSimulation() {
  cudaFree(dev_pos);

  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_posShuffle);
+  cudaFree(dev_velShuffle);
}

void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..a7b8586 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,7 +18,7 @@
#define COHERENT_GRID 0

// LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
const float DT = 0.2f;

/**