diff --git a/CMakeSettings.json b/CMakeSettings.json
new file mode 100644
index 0000000..9204f06
--- /dev/null
+++ b/CMakeSettings.json
@@ -0,0 +1,15 @@
+﻿{
+  "configurations": [
+    {
+      "name": "x64-Debug",
+      "generator": "Ninja",
+      "configurationType": "Debug",
+      "inheritEnvironments": [ "msvc_x64_x64" ],
+      "buildRoot": "${projectDir}\\out\\build\\${name}",
+      "installRoot": "${projectDir}\\out\\install\\${name}",
+      "cmakeCommandArgs": "",
+      "buildCommandArgs": "",
+      "ctestCommandArgs": ""
+    }
+  ]
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index d63a6a1..2e40e73 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,46 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zirui Zang
+  * [LinkedIn](https://www.linkedin.com/in/zirui-zang/)
+* Tested on: Windows 10, AMD Ryzen 7 3700X @ 3.60GHz 32GB, RTX2070 SUPER 8GB (Personal)
 
-### (TODO: Your README)
+### YouTube Demos
+These demos have 3 million boids. This code can run 5 million boids at 30+fps.
+[Demo 1](https://youtu.be/Kh1WCxn45gk)
+[Demo 2](https://youtu.be/M3_JNL-yDTA)
+[Demo 3](https://youtu.be/cQeCt1B_nHg)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Screenshots
+These screenshots are cpatured with 5e5 particles simulation.
+
+Part 1: Naive method (Brute force searching)
+![Part 1](images/1-500000.png)
+
+Part 2.1: Grid method (Grid searching)
+![Part 2.1](images/2-500000.png)
+
+Part 2.1: Coherent Grid method (Enhanced memory access)
+![Part 2.2](images/3-500000.png)
+
+### Gifs
+20000 Particles
+![20000](images/20000.gif)
+
+50000 Particles
+![50000](images/50000.gif)
+
+1000000 Particles
+![1000000](images/100000.gif)
+
+### Performance Summary
+5 Methods' FPS with Increasing # of Boids
+![chart](images/chart.png)
+
+3 Methods' FPS with Increasing Blacksize
+![chart](images/chart2.png)
+
+1. For each implementation, increasing number of boid cause decrease in FPS. There are more particles to calculate also more traffic at the memory.
+2. Per my experiments, changing block size does not affect FPS. This is probably because large block size results more synchronized computation, which is not shown in this project.
+3. Changing to coherent uniform grid does improve the performance with 5e5 or more particles, but not improving with 1e5 or less particles. This is because the benefit of having aligned memory access can only become larger than the cost of sorted the list when the particles base is large enough.
+4. Changing grid width and using 27 neighoring cells actually increase the performance. This is more prominent in the grid method than in the coherent grid method. This is because having smaller checking range leads to fewer memory accesses, which is more important in the grid implementation than in coherent implementation.
diff --git a/images/1-500000.png b/images/1-500000.png
new file mode 100644
index 0000000..f3e3764
Binary files /dev/null and b/images/1-500000.png differ
diff --git a/images/100000.gif b/images/100000.gif
new file mode 100644
index 0000000..7f91369
Binary files /dev/null and b/images/100000.gif differ
diff --git a/images/2-500000.png b/images/2-500000.png
new file mode 100644
index 0000000..de9b3f8
Binary files /dev/null and b/images/2-500000.png differ
diff --git a/images/20000.gif b/images/20000.gif
new file mode 100644
index 0000000..d1a4919
Binary files /dev/null and b/images/20000.gif differ
diff --git a/images/3-500000.png b/images/3-500000.png
new file mode 100644
index 0000000..6e25a1d
Binary files /dev/null and b/images/3-500000.png differ
diff --git a/images/50000.gif b/images/50000.gif
new file mode 100644
index 0000000..764fe38
Binary files /dev/null and b/images/50000.gif differ
diff --git a/images/chart.png b/images/chart.png
new file mode 100644
index 0000000..8be8cf8
Binary files /dev/null and b/images/chart.png differ
diff --git a/images/chart2.png b/images/chart2.png
new file mode 100644
index 0000000..574daa9
Binary files /dev/null and b/images/chart2.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..cc1bebc 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -38,6 +38,8 @@ void checkCUDAError(const char *msg, int line = -1) {
 
 /*! Block size used for CUDA kernel launch. */
 #define blockSize 128
+//#define blockSize 512
+//#define blockSize 1024
 
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
@@ -45,6 +47,10 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define rule2Distance 3.0f
 #define rule3Distance 5.0f
 
+//#define rule1Distance 25.0f
+//#define rule2Distance 9.0f
+//#define rule3Distance 25.0f
+
 #define rule1Scale 0.01f
 #define rule2Scale 0.1f
 #define rule3Scale 0.1f
@@ -52,7 +58,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 #define maxSpeed 1.0f
 
 /*! Size of the starting area in simulation space. */
-#define scene_scale 100.0f
+#define scene_scale 200.0f
 
 /***********************************************
 * Kernel state (pointers are device pointers) *
@@ -80,6 +86,12 @@ int *dev_particleGridIndices; // What grid cell is this particle in?
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
+// for 2.3
+thrust::device_ptr<int> dev_thrust_pos;
+thrust::device_ptr<int> dev_thrust_vel1;
+thrust::device_ptr<int> dev_thrust_vel2;
+int* dev_particleGridIndices_back;
+
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
 
@@ -93,6 +105,8 @@ int gridSideCount;
 float gridCellWidth;
 float gridInverseCellWidth;
 glm::vec3 gridMinimum;
+float dev_time_ms = 0.0;
+float dev_cell_scale;
 
 /******************
 * initSimulation *
@@ -152,10 +166,13 @@ void Boids::initSimulation(int N) {
   checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
 
   // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
+  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects, dev_pos, scene_scale);
   checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
+  // Initialize with random velocity
+  //kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(2, numObjects, dev_vel1, maxSpeed * 0.5);
+  //checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
   // LOOK-2.1 computing grid params
   gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
   int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
@@ -169,6 +186,22 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices_back, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices_back failed!");
+
+  cudaMalloc((void**)&dev_time_ms, sizeof(float));
+  checkCUDAErrorWithLine("cudaMalloc dev_time_ms failed!");
   cudaDeviceSynchronize();
 }
 
@@ -233,18 +266,68 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+    glm::vec3 return_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 this_pos = pos[iSelf];
+    glm::vec3 center_of_mass = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_pos_sum = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_boid_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+    float rule1_boid_count = 0.0;
+    float rule3_boid_count = 0.0;
+    
+    for (int boid_ind = 0; boid_ind < N; boid_ind++) {
+        if (boid_ind != iSelf) {
+            glm::vec3 boid_pos = pos[boid_ind];
+            float distance = glm::length(boid_pos - this_pos);
+
+            // rule 1
+            if (distance < rule1Distance) {
+                rule1_boid_count = rule1_boid_count + 1.0;
+                center_of_mass = center_of_mass + boid_pos;
+            }
+
+            // rule 2
+            if (distance < rule2Distance) {
+                close_pos_sum = close_pos_sum - (boid_pos - this_pos);
+            }
+
+            // rule 3
+            if (distance < rule3Distance) {
+                rule3_boid_count = rule3_boid_count + 1.0;
+                close_boid_vel = close_boid_vel + vel[boid_ind];
+            }
+        }
+    }
+    if (rule1_boid_count != 0) {
+        center_of_mass = center_of_mass / rule1_boid_count;
+        return_velocity = return_velocity + (center_of_mass - this_pos) * rule1Scale;
+    }
+    return_velocity = return_velocity + close_pos_sum * rule2Scale;
+    if (rule3_boid_count != 0) {
+        close_boid_vel = close_boid_vel / rule3_boid_count;
+        return_velocity = return_velocity + close_boid_vel * rule3Scale;
+    }
+    return_velocity = return_velocity + vel[iSelf];
+    if (glm::length(return_velocity) > 1) {
+        return_velocity = glm::normalize(return_velocity) * maxSpeed;
+    }
+    return return_velocity;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3* vel1, glm::vec3* vel2) {
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    vel2[index] = computeVelocityChange(N, index, pos, vel1);
 }
 
 /**
@@ -257,19 +340,19 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
   if (index >= N) {
     return;
   }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
+  glm::vec3 this_pos = pos[index];
+  this_pos += vel[index] * dt;
 
   // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+  this_pos.x = this_pos.x < -scene_scale ? scene_scale : this_pos.x;
+  this_pos.y = this_pos.y < -scene_scale ? scene_scale : this_pos.y;
+  this_pos.z = this_pos.z < -scene_scale ? scene_scale : this_pos.z;
 
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+  this_pos.x = this_pos.x > scene_scale ? -scene_scale : this_pos.x;
+  this_pos.y = this_pos.y > scene_scale ? -scene_scale : this_pos.y;
+  this_pos.z = this_pos.z > scene_scale ? -scene_scale : this_pos.z;
 
-  pos[index] = thisPos;
+  pos[index] = this_pos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -289,6 +372,32 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index > N) {
+        return;
+    }
+    glm::vec3 this_pos_from_edge = pos[index] - gridMin;
+    indices[index] = index;
+    gridIndices[index] = gridIndex3Dto1D(this_pos_from_edge.x * inverseCellWidth,
+        this_pos_from_edge.y * inverseCellWidth,
+        this_pos_from_edge.z * inverseCellWidth, gridResolution);
+}
+
+__global__ void kernComputeIndices2(int N, int gridResolution,
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3* pos, int* gridIndices) {
+    // TODO-2.1
+    // - Label each boid with the index of its grid cell.
+    // - Set up a parallel array of integer indices as pointers to the actual
+    //   boid data in pos and vel1/vel2
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index > N) {
+        return;
+    }
+    glm::vec3 this_pos_from_edge = pos[index] - gridMin;
+    gridIndices[index] = gridIndex3Dto1D(this_pos_from_edge.x * inverseCellWidth,
+        this_pos_from_edge.y * inverseCellWidth,
+        this_pos_from_edge.z * inverseCellWidth, gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,22 +415,304 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    int grid_ind = particleGridIndices[index];
+    if (index == 0) {
+        gridCellStartIndices[grid_ind] = index;
+    }
+    else {
+        int pre_grid_ind = particleGridIndices[index - 1];
+        if (grid_ind != pre_grid_ind) {
+            gridCellEndIndices[pre_grid_ind] = index;
+            gridCellStartIndices[grid_ind] = index;
+        }
+    }
+}
+
+__device__ int* find_8_neighbors(float inverseCellWidth, int gridResolution, glm::vec3 this_pos_from_edge) {
+    // find the neighbors
+    float probe_distance = imax(imax(rule1Distance, rule2Distance), rule3Distance);
+    int self_xyzs[3];
+    int neighbor_xyzs[3] = { -1 };
+    int neighbor_grid_inds[8] = { -1 };
+    self_xyzs[0] = this_pos_from_edge.x * inverseCellWidth;
+    self_xyzs[1] = this_pos_from_edge.y * inverseCellWidth;
+    self_xyzs[2] = this_pos_from_edge.z * inverseCellWidth;
+
+    // find the xyz to combine for the 8 neighbors
+    int probe_grid_ind = (this_pos_from_edge.x + probe_distance) * inverseCellWidth; // +x
+    if (probe_grid_ind != self_xyzs[0] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[0] = probe_grid_ind;
+    }
+    else {
+        probe_grid_ind = (this_pos_from_edge.x - probe_distance) * inverseCellWidth; // -x
+        if (probe_grid_ind != self_xyzs[0] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+            neighbor_xyzs[0] = probe_grid_ind;
+        }
+    }
+
+    probe_grid_ind = (this_pos_from_edge.y + probe_distance) * inverseCellWidth; // +y
+    if (probe_grid_ind != self_xyzs[1] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[1] = probe_grid_ind;
+    }
+    else {
+        probe_grid_ind = (this_pos_from_edge.y - probe_distance) * inverseCellWidth; // -y
+        if (probe_grid_ind != self_xyzs[1] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+            neighbor_xyzs[1] = probe_grid_ind;
+        }
+    }
+
+    probe_grid_ind = (this_pos_from_edge.z + probe_distance) * inverseCellWidth; // +z
+    if (probe_grid_ind != self_xyzs[2] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[2] = probe_grid_ind;
+    }
+    else {
+        probe_grid_ind = (this_pos_from_edge.z - probe_distance) * inverseCellWidth; // -z
+        if (probe_grid_ind != self_xyzs[2] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+            neighbor_xyzs[2] = probe_grid_ind;
+        }
+    }
+
+    // assemble the 8 neighbors
+    neighbor_grid_inds[0] = gridIndex3Dto1D(self_xyzs[0], self_xyzs[1], self_xyzs[2], gridResolution);
+    if (neighbor_xyzs[0] != -1) {
+        neighbor_grid_inds[1] = gridIndex3Dto1D(neighbor_xyzs[0], self_xyzs[1], self_xyzs[2], gridResolution);
+
+        if (neighbor_xyzs[1] != -1) {
+            neighbor_grid_inds[2] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[1], self_xyzs[2], gridResolution);
+            if (neighbor_xyzs[2] != -1) {
+                neighbor_grid_inds[4] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[1], neighbor_xyzs[2], gridResolution);
+            }
+        }
+
+        if (neighbor_xyzs[2] != -1) {
+            neighbor_grid_inds[3] = gridIndex3Dto1D(neighbor_xyzs[0], self_xyzs[1], neighbor_xyzs[2], gridResolution);
+        }
+    }
+    if (neighbor_xyzs[1] != -1) {
+        neighbor_grid_inds[5] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[1], self_xyzs[2], gridResolution);
+        if (neighbor_xyzs[2] != -1) {
+            neighbor_grid_inds[6] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[1], neighbor_xyzs[2], gridResolution);
+        }
+    }
+    if (neighbor_xyzs[2] != -1) {
+        neighbor_grid_inds[7] = gridIndex3Dto1D(self_xyzs[0], self_xyzs[1], neighbor_xyzs[2], gridResolution);
+    }
+    return neighbor_grid_inds;
+}
+
+__device__ int* find_27_neighbors(float inverseCellWidth, int gridResolution, glm::vec3 this_pos_from_edge) {
+    // find the neighbors
+    float probe_distance = imax(imax(rule1Distance, rule2Distance), rule3Distance);
+    int self_xyzs[3];
+    int neighbor_xyzs[6] = { -1 };
+    int neighbor_grid_inds[27] = { -1 };
+    self_xyzs[0] = this_pos_from_edge.x * inverseCellWidth;
+    self_xyzs[1] = this_pos_from_edge.y * inverseCellWidth;
+    self_xyzs[2] = this_pos_from_edge.z * inverseCellWidth;
+
+    // find the xyz to combine for the 27 neighbors
+    int probe_grid_ind = (this_pos_from_edge.x + probe_distance) * inverseCellWidth; // +x
+    if (probe_grid_ind != self_xyzs[0] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[0] = probe_grid_ind;
+    }
+    probe_grid_ind = (this_pos_from_edge.x - probe_distance) * inverseCellWidth; // -x
+    if (probe_grid_ind != self_xyzs[0] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[1] = probe_grid_ind;
+    }
+
+    probe_grid_ind = (this_pos_from_edge.y + probe_distance) * inverseCellWidth; // +y
+    if (probe_grid_ind != self_xyzs[1] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[2] = probe_grid_ind;
+    }
+    probe_grid_ind = (this_pos_from_edge.y - probe_distance) * inverseCellWidth; // -y
+    if (probe_grid_ind != self_xyzs[1] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[3] = probe_grid_ind;
+    }
+
+    probe_grid_ind = (this_pos_from_edge.z + probe_distance) * inverseCellWidth; // +z
+    if (probe_grid_ind != self_xyzs[2] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[4] = probe_grid_ind;
+    }
+    probe_grid_ind = (this_pos_from_edge.z - probe_distance) * inverseCellWidth; // -z
+    if (probe_grid_ind != self_xyzs[2] && probe_grid_ind >= 0 && probe_grid_ind < gridResolution) {
+        neighbor_xyzs[5] = probe_grid_ind;
+    }
+
+    // assemble the 27 neighbors
+    neighbor_grid_inds[0] = gridIndex3Dto1D(self_xyzs[0], self_xyzs[1], self_xyzs[2], gridResolution);
+    if (neighbor_xyzs[0] != -1) { // 9 in +x
+        neighbor_grid_inds[1] = gridIndex3Dto1D(neighbor_xyzs[0], self_xyzs[1], self_xyzs[2], gridResolution);
+
+        if (neighbor_xyzs[2] != -1) {
+            neighbor_grid_inds[2] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[2], self_xyzs[2], gridResolution);
+            if (neighbor_xyzs[4] != -1) {
+                neighbor_grid_inds[3] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[2], neighbor_xyzs[4], gridResolution);
+            }
+            if (neighbor_xyzs[5] != -1) {
+                neighbor_grid_inds[4] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[2], neighbor_xyzs[5], gridResolution);
+            }
+        }
+
+        if (neighbor_xyzs[3] != -1) {
+            neighbor_grid_inds[5] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[3], self_xyzs[2], gridResolution);
+            if (neighbor_xyzs[4] != -1) {
+                neighbor_grid_inds[6] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[3], neighbor_xyzs[4], gridResolution);
+            }
+            if (neighbor_xyzs[5] != -1) {
+                neighbor_grid_inds[7] = gridIndex3Dto1D(neighbor_xyzs[0], neighbor_xyzs[3], neighbor_xyzs[5], gridResolution);
+            }
+        }
+
+        if (neighbor_xyzs[4] != -1) {
+            neighbor_grid_inds[8] = gridIndex3Dto1D(neighbor_xyzs[0], self_xyzs[1], neighbor_xyzs[4], gridResolution);
+        }
+        if (neighbor_xyzs[5] != -1) {
+            neighbor_grid_inds[9] = gridIndex3Dto1D(neighbor_xyzs[0], self_xyzs[1], neighbor_xyzs[5], gridResolution);
+        }
+    }
+
+    if (neighbor_xyzs[1] != -1) { // 9 in -x
+        neighbor_grid_inds[10] = gridIndex3Dto1D(neighbor_xyzs[1], self_xyzs[1], self_xyzs[2], gridResolution);
+
+        if (neighbor_xyzs[2] != -1) {
+            neighbor_grid_inds[11] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[2], self_xyzs[2], gridResolution);
+            if (neighbor_xyzs[4] != -1) {
+                neighbor_grid_inds[12] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[2], neighbor_xyzs[4], gridResolution);
+            }
+            if (neighbor_xyzs[5] != -1) {
+                neighbor_grid_inds[13] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[2], neighbor_xyzs[5], gridResolution);
+            }
+        }
+
+        if (neighbor_xyzs[3] != -1) {
+            neighbor_grid_inds[14] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[3], self_xyzs[2], gridResolution);
+            if (neighbor_xyzs[4] != -1) {
+                neighbor_grid_inds[15] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[3], neighbor_xyzs[4], gridResolution);
+            }
+            if (neighbor_xyzs[5] != -1) {
+                neighbor_grid_inds[16] = gridIndex3Dto1D(neighbor_xyzs[1], neighbor_xyzs[3], neighbor_xyzs[5], gridResolution);
+            }
+        }
+
+        if (neighbor_xyzs[4] != -1) {
+            neighbor_grid_inds[17] = gridIndex3Dto1D(neighbor_xyzs[1], self_xyzs[1], neighbor_xyzs[4], gridResolution);
+        }
+        if (neighbor_xyzs[5] != -1) {
+            neighbor_grid_inds[18] = gridIndex3Dto1D(neighbor_xyzs[1], self_xyzs[1], neighbor_xyzs[5], gridResolution);
+        }
+    }
+
+    if (neighbor_xyzs[2] != -1) { // 3 in +y
+        neighbor_grid_inds[19] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[2], self_xyzs[2], gridResolution);
+        if (neighbor_xyzs[4] != -1) {
+            neighbor_grid_inds[20] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[2], neighbor_xyzs[4], gridResolution);
+        }
+        if (neighbor_xyzs[5] != -1) {
+            neighbor_grid_inds[21] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[2], neighbor_xyzs[5], gridResolution);
+        }
+    }
+
+    if (neighbor_xyzs[3] != -1) { // 3 in -y
+        neighbor_grid_inds[22] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[3], self_xyzs[2], gridResolution);
+        if (neighbor_xyzs[4] != -1) {
+            neighbor_grid_inds[23] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[3], neighbor_xyzs[4], gridResolution);
+        }
+        if (neighbor_xyzs[5] != -1) {
+            neighbor_grid_inds[24] = gridIndex3Dto1D(self_xyzs[0], neighbor_xyzs[3], neighbor_xyzs[5], gridResolution);
+        }
+    }
+
+    if (neighbor_xyzs[4] != -1) { // 1 in +z
+        neighbor_grid_inds[25] = gridIndex3Dto1D(self_xyzs[0], self_xyzs[1], neighbor_xyzs[4], gridResolution);
+    }
+    if (neighbor_xyzs[5] != -1) { // 1 in -z
+        neighbor_grid_inds[26] = gridIndex3Dto1D(self_xyzs[0], self_xyzs[1], neighbor_xyzs[5], gridResolution);
+    }
+
+    return neighbor_grid_inds;
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int* gridCellStartIndices, int* gridCellEndIndices,
+    int* particleArrayIndices,
+    glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // - Identify the grid cell that this particle is in
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+    // - Clamp the speed change before putting the new speed in vel2
+
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 this_pos = pos[index];
+    glm::vec3 this_pos_from_edge = this_pos - gridMin;
+    int* neighbor_grid_inds;
+    neighbor_grid_inds = find_8_neighbors(inverseCellWidth, gridResolution, this_pos_from_edge);
+    //neighbor_grid_inds = find_27_neighbors(inverseCellWidth, gridResolution, this_pos_from_edge);
+    
+
+    // find all velcities
+    glm::vec3 return_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 center_of_mass = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_pos_sum = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_boid_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+    float rule1_boid_count = 0.0;
+    float rule3_boid_count = 0.0;
+    for (int neighbor_grid_list_ind = 0; neighbor_grid_list_ind < 8; neighbor_grid_list_ind++) {
+        int neighbor_grid_ind = neighbor_grid_inds[neighbor_grid_list_ind];
+        if (neighbor_grid_ind != -1) {
+            for (int boid_in_grid_ind = gridCellStartIndices[neighbor_grid_ind];
+                boid_in_grid_ind < gridCellEndIndices[neighbor_grid_ind]; boid_in_grid_ind++) {
+                int boid_ind = particleArrayIndices[boid_in_grid_ind];
+                if (boid_ind != index) {
+                    glm::vec3 boid_pos = pos[boid_ind];
+                    float distance = glm::length(boid_pos - this_pos);
+
+                    // rule 1
+                    if (distance < rule1Distance) {
+                        rule1_boid_count = rule1_boid_count + 1.0;
+                        center_of_mass = center_of_mass + boid_pos;
+                    }
+
+                    // rule 2
+                    if (distance < rule2Distance) {
+                        close_pos_sum = close_pos_sum - (boid_pos - this_pos);
+                    }
+
+                    // rule 3
+                    if (distance < rule3Distance) {
+                        rule3_boid_count = rule3_boid_count + 1.0;
+                        close_boid_vel = close_boid_vel + vel1[boid_ind];
+                    }
+                }
+            }
+        }
+    }
+    if (rule1_boid_count != 0) {
+        center_of_mass = center_of_mass / rule1_boid_count;
+        return_velocity = return_velocity + (center_of_mass - this_pos) * rule1Scale;
+    }
+    return_velocity = return_velocity + close_pos_sum * rule2Scale;
+    if (rule3_boid_count != 0) {
+        close_boid_vel = close_boid_vel / rule3_boid_count;
+        return_velocity = return_velocity + close_boid_vel * rule3Scale;
+    }
+    return_velocity = return_velocity + vel1[index];
+    if (glm::length(return_velocity) > 1) {
+        return_velocity = glm::normalize(return_velocity) * maxSpeed;
+    }
+    vel2[index] = return_velocity;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +732,66 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 this_pos = pos[index];
+    glm::vec3 this_pos_from_edge = this_pos - gridMin;
+    int* neighbor_grid_inds;
+    neighbor_grid_inds = find_8_neighbors(inverseCellWidth, gridResolution, this_pos_from_edge);
+    //neighbor_grid_inds = find_27_neighbors(inverseCellWidth, gridResolution, this_pos_from_edge);
+
+    // find all velcities
+    glm::vec3 return_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 center_of_mass = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_pos_sum = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 close_boid_vel = glm::vec3(0.0f, 0.0f, 0.0f);
+    float rule1_boid_count = 0.0;
+    float rule3_boid_count = 0.0;
+    for (int neighbor_grid_list_ind = 0; neighbor_grid_list_ind < 8; neighbor_grid_list_ind++) {
+        int neighbor_grid_ind = neighbor_grid_inds[neighbor_grid_list_ind];
+        if (neighbor_grid_ind != -1) {
+            for (int boid_in_grid_ind = gridCellStartIndices[neighbor_grid_ind];
+                boid_in_grid_ind < gridCellEndIndices[neighbor_grid_ind]; boid_in_grid_ind++) {
+                if (boid_in_grid_ind != index) {
+                    glm::vec3 boid_pos = pos[boid_in_grid_ind];
+                    float distance = glm::length(boid_pos - this_pos);
+
+                    // rule 1
+                    if (distance < rule1Distance) {
+                        rule1_boid_count = rule1_boid_count + 1.0;
+                        center_of_mass = center_of_mass + boid_pos;
+                    }
+
+                    // rule 2
+                    if (distance < rule2Distance) {
+                        close_pos_sum = close_pos_sum - (boid_pos - this_pos);
+                    }
+
+                    // rule 3
+                    if (distance < rule3Distance) {
+                        rule3_boid_count = rule3_boid_count + 1.0;
+                        close_boid_vel = close_boid_vel + vel1[boid_in_grid_ind];
+                    }
+                }
+            }
+        }
+    }
+    if (rule1_boid_count != 0) {
+        center_of_mass = center_of_mass / rule1_boid_count;
+        return_velocity = return_velocity + (center_of_mass - this_pos) * rule1Scale;
+    }
+    return_velocity = return_velocity + close_pos_sum * rule2Scale;
+    if (rule3_boid_count != 0) {
+        close_boid_vel = close_boid_vel / rule3_boid_count;
+        return_velocity = return_velocity + close_boid_vel * rule3Scale;
+    }
+    return_velocity = return_velocity + vel1[index];
+    if (glm::length(return_velocity) > 1) {
+        return_velocity = glm::normalize(return_velocity) * maxSpeed;
+    }
+    vel2[index] = return_velocity;
 }
 
 /**
@@ -349,6 +800,16 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernUpdateVelocityBruteForce << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+    cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +825,34 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+    // compute grid_ind for each boid
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+    // sort the grid_ind-boid_ind by grid_ind
+    thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+    thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+    
+    // find boid in each grid cell
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+    
+    // update pos and vels
+    kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices,
+        dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+    cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,11 +871,56 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    //cudaEvent_t start, stop;
+    //cudaEventCreate(&start);
+    //cudaEventCreate(&stop);
+    //cudaEventRecord(start);
+
+    // compute grid_ind for each boid
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernComputeIndices2 << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+
+    // sort the grid_ind-boid_ind by grid_ind
+    thrust::device_ptr<glm::vec3> dev_thrust_pos(dev_pos);
+    thrust::device_ptr<glm::vec3> dev_thrust_vel1(dev_vel1);
+    thrust::device_ptr<glm::vec3> dev_thrust_vel2(dev_vel2);
+    thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+    cudaMemcpy(dev_particleGridIndices_back, dev_particleGridIndices, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_pos);
+    cudaMemcpy(dev_particleGridIndices, dev_particleGridIndices_back, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_vel1);
+    cudaMemcpy(dev_particleGridIndices, dev_particleGridIndices_back, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice);
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_vel2);
+
+    // find boid in each grid cell
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    // update pos and vels
+    kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (
+        numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+        dev_gridCellStartIndices, dev_gridCellEndIndices,
+        dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+    cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+    //cudaEventRecord(stop);
+    //cudaEventSynchronize(stop);
+    //float milliseconds = 0;
+    //cudaEventElapsedTime(&milliseconds, start, stop);
+    //dev_time_ms = dev_time_ms * 0.99 + milliseconds * 0.01;
+    //printf("%.6f /n", dev_time_ms);
 }
 
 void Boids::endSimulation() {
   cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
+  cudaFree(dev_vel1);
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..3de458b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,13 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 100000;
+//const int N_FOR_VIS = 500000;
+//const int N_FOR_VIS = 20000;
 const float DT = 0.2f;
 
 /**
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..438a072 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,8 +36,10 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
+//int width = 1920;
+//int height = 1080;
+int width = 3840;
+int height = 2160;
 int pointSize = 2;
 
 // For camera controls