diff --git a/README.md b/README.md
index d63a6a1..66f597b 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,35 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xinyu Lin
+  [LinkedIn](https://www.linkedin.com/in/xinyu-lin-138352125/)
+* Tested on: Windows 10, Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz, 16GB, GTX 960M (personal laptop)
 
-### (TODO: Your README)
+### Screenshot of flocking boids
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](images/5.png)
+
+### Performance Analysis
+
+* Framerate change with increasing number of boids
+![](images/3.png)
+
+* Framerate change with increasing block size at 5,000 boids
+![](images/4.png)
+
+### Questions
+1. For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+* Performance (framerate) drops as the number of boids increases. Every boid has to examine more potential neighbors, so the amount of work per simulation step grows with the boid count.
+
+2. For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+* In my measurements, block count and block size had no noticeable effect on performance.
+
+3. For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+* No. In my code the coherent uniform grid runs at roughly the same speed as the scattered uniform grid. This was not what I expected: the coherent grid should be faster, because reshuffling the position and velocity buffers makes neighbor reads contiguous in memory.
+
+4. Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+* Yes. Each thread walks the candidate neighboring cells in a for-loop, so enlarging the search volume (more cells, or wider cells) increases the number of candidate boids each thread has to examine and slows the update down (see the sketch below).
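+
+As a rough illustration of the answers to questions 1 and 4, every thread walks its candidate
+cells with a nested loop. The sketch below is simplified and uses made-up names
+(`scanNeighborCells`, `cellsToCheck`, `neighborCells`); it is not the exact code in
+`src/kernel.cu`, where the real work happens in `kernUpdateVelNeighborSearchScattered` and
+`kernUpdateVelNeighborSearchCoherent`:
+
+```cuda
+// Sketch: per-boid scan over a fixed set of candidate grid cells.
+// gridCellStartIndices / gridCellEndIndices hold the per-cell index ranges
+// built by kernIdentifyCellStartEnd; -1 marks an empty cell.
+__device__ int scanNeighborCells(int cellsToCheck, const int *neighborCells,
+                                 const int *gridCellStartIndices,
+                                 const int *gridCellEndIndices) {
+  int candidates = 0;
+  for (int i = 0; i < cellsToCheck; ++i) {   // 8 or 27 candidate cells
+    int cell = neighborCells[i];
+    if (cell == -1) continue;                // cell lies outside the grid
+    int start = gridCellStartIndices[cell];
+    int end   = gridCellEndIndices[cell];
+    if (start < 0) continue;                 // no boids fell into this cell
+    for (int j = start; j <= end; ++j) {
+      ++candidates;                          // distance test + flocking rules go here
+    }
+  }
+  return candidates;
+}
+```
+
+The per-thread work grows with both the number of candidate cells and the number of boids
+inside them, which is why the framerate drops as the boid count grows and why the 27-cell
+(or wider-cell) search was slower in my tests.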
diff --git a/images/1.png b/images/1.png
new file mode 100644
index 0000000..6d9e932
Binary files /dev/null and b/images/1.png differ
diff --git a/images/2.png b/images/2.png
new file mode 100644
index 0000000..0c599f6
Binary files /dev/null and b/images/2.png differ
diff --git a/images/3.png b/images/3.png
new file mode 100644
index 0000000..55bb223
Binary files /dev/null and b/images/3.png differ
diff --git a/images/4.png b/images/4.png
new file mode 100644
index 0000000..ef434ea
Binary files /dev/null and b/images/4.png differ
diff --git a/images/5.png b/images/5.png
new file mode 100644
index 0000000..5733f63
Binary files /dev/null and b/images/5.png differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..dff0113 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..5b55d30 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -86,6 +86,9 @@ int *dev_gridCellEndIndices; // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_coherentPos;   // positions reshuffled into grid-cell order
+glm::vec3 *dev_coherentVel1;  // velocities reshuffled into grid-cell order
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +172,38 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_coherentPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentPos failed!");
+
+  cudaMalloc((void**)&dev_coherentVel1, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_coherentVel1 failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
@@ -225,15 +260,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 /**
 * LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce.
 * __device__ code can be called from a __global__ context
 * Compute the new velocity on the body with index `iSelf` due to the `N` boids
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 perceived_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+  int flag1 = 0, flag2 = 0;  // neighbor counts for rule 1 and rule 3
+
+  for (int i = 0; i < N; ++i)
+  {
+    if (i != iSelf)
+    {
+      float m_distance = glm::distance(pos[i], pos[iSelf]);
+      if (m_distance < rule1Distance)
+      {
+        perceived_center += pos[i];
+        ++flag1;
+      }
+      if (m_distance < rule2Distance)
+      {
+        c -= (pos[i] - pos[iSelf]);
+      }
+      if (m_distance < rule3Distance)
+      {
+        perceived_velocity += vel[i];
+        ++flag2;
+      }
+    }
+  }
+
+  if (flag1 > 0)
+  {
+    perceived_center /= (float)flag1;
+  }
+  else
+  {
+    perceived_center = pos[iSelf];
+  }
+
+  if (flag2 > 0)
+  {
+    perceived_velocity /= (float)flag2;
+  }
+  else
+  {
+    perceived_velocity = vel[iSelf];
+  }
+
+  glm::vec3 m_result1 = (perceived_center - pos[iSelf]) * rule1Scale;
+  glm::vec3 m_result2 = c * rule2Scale;
+  glm::vec3 m_result3 = perceived_velocity * rule3Scale;
+
+  return m_result1 + m_result2 + m_result3;
 }
 
 /**
@@ -242,9 +326,17 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) return;
+
+  glm::vec3 new_velocity = computeVelocityChange(N, index, pos, vel1) + vel1[index];
+  if (glm::length(new_velocity) > maxSpeed) new_velocity = glm::normalize(new_velocity) * maxSpeed;
+  vel2[index] = new_velocity;
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  // Writing into vel1 would race with other threads that are still reading it,
+  // so the result goes into vel2 and the buffers are ping-ponged afterwards.
+
 }
 
 /**
@@ -306,6 +398,31 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N)
+  {
+    return;
+  }
+
+  int gridIdx = particleGridIndices[index];
+  if (index == 0)
+  {
+    gridCellStartIndices[gridIdx] = 0;
+    return;
+  }
+  if (index == N - 1)
+  {
+    gridCellEndIndices[gridIdx] = N - 1;
+  }
+
+  int oldGridIdx = particleGridIndices[index - 1];
+  if (oldGridIdx != gridIdx)
+  {
+    gridCellStartIndices[gridIdx] = index;
+    gridCellEndIndices[oldGridIdx] = index - 1;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +439,97 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+  if (index < N)
+  {
+    int particleIdx = particleArrayIndices[index];
+    glm::vec3 gridIdx3 = (pos[particleIdx] - gridMin) * inverseCellWidth;
+
+    // Check the boid's own cell plus the 7 neighbors on the side of the cell
+    // the boid actually sits in.
+    int neighborhoods[8];
+    int neighborIdxx = glm::fract(gridIdx3.x) > 0.5 ? 1 : -1;
+    int neighborIdxy = glm::fract(gridIdx3.y) > 0.5 ? 1 : -1;
+    int neighborIdxz = glm::fract(gridIdx3.z) > 0.5 ? 1 : -1;
+
+    glm::ivec3 offset[8] = {
+      glm::ivec3(0, 0, 0),
+      glm::ivec3(neighborIdxx, 0, 0),
+      glm::ivec3(0, neighborIdxy, 0),
+      glm::ivec3(0, 0, neighborIdxz),
+      glm::ivec3(neighborIdxx, neighborIdxy, 0),
+      glm::ivec3(neighborIdxx, 0, neighborIdxz),
+      glm::ivec3(0, neighborIdxy, neighborIdxz),
+      glm::ivec3(neighborIdxx, neighborIdxy, neighborIdxz)
+    };
+
+    // Store the 1D index of each candidate cell, or -1 if it lies outside the grid.
+    for (int i = 0; i < 8; i++)
+    {
+      glm::ivec3 idx = glm::ivec3(gridIdx3) + offset[i];
+      if ((idx.x < 0) || (idx.x >= gridResolution) || (idx.y < 0) || (idx.y >= gridResolution) || (idx.z < 0) || (idx.z >= gridResolution))
+      {
+        neighborhoods[i] = -1;
+      }
+      else
+      {
+        neighborhoods[i] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
+      }
+    }
+
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 perceived_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+    int flag1 = 0, flag2 = 0;
+
+    for (int i = 0; i < 8; ++i)
+    {
+      int gridIdx = neighborhoods[i];
+      if (gridIdx == -1)
+      {
+        continue;
+      }
+
+      int startIdx = gridCellStartIndices[gridIdx];
+      int endIdx = gridCellEndIndices[gridIdx];
+      if (startIdx < 0)  // no boids fell into this cell
+      {
+        continue;
+      }
+
+      for (int j = startIdx; j <= endIdx; ++j)
+      {
+        int boidIdx = particleArrayIndices[j];
+        if (boidIdx != particleIdx)
+        {
+          float m_distance = glm::distance(pos[boidIdx], pos[particleIdx]);
+          if (m_distance < rule1Distance)
+          {
+            perceived_center += pos[boidIdx];
+            ++flag1;
+          }
+          if (m_distance < rule2Distance)
+          {
+            c -= (pos[boidIdx] - pos[particleIdx]);
+          }
+          if (m_distance < rule3Distance)
+          {
+            perceived_velocity += vel1[boidIdx];
+            ++flag2;
+          }
+        }
+      }
+    }
+
+    if (flag1 > 0) perceived_center /= (float)flag1;
+    else perceived_center = pos[particleIdx];
+
+    if (flag2 > 0) perceived_velocity /= (float)flag2;
+
+    glm::vec3 m_result = vel1[particleIdx] + (perceived_center - pos[particleIdx]) * rule1Scale + c * rule2Scale + perceived_velocity * rule3Scale;
+
+    if (glm::length(m_result) > maxSpeed) m_result = glm::normalize(m_result) * maxSpeed;
+    vel2[particleIdx] = m_result;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,14 +549,114 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < N)
+  {
+    glm::vec3 gridIdx3 = (pos[index] - gridMin) * inverseCellWidth;
+    int neighborhoods[8];
+    int neighborIdxX = glm::fract(gridIdx3.x) > 0.5 ? 1 : -1;
+    int neighborIdxY = glm::fract(gridIdx3.y) > 0.5 ? 1 : -1;
+    int neighborIdxZ = glm::fract(gridIdx3.z) > 0.5 ? 1 : -1;
+
+    glm::ivec3 offset[8] = {
+      glm::ivec3(0, 0, 0),
+      glm::ivec3(neighborIdxX, 0, 0),
+      glm::ivec3(0, neighborIdxY, 0),
+      glm::ivec3(0, 0, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, 0),
+      glm::ivec3(neighborIdxX, 0, neighborIdxZ),
+      glm::ivec3(0, neighborIdxY, neighborIdxZ),
+      glm::ivec3(neighborIdxX, neighborIdxY, neighborIdxZ)
+    };
+
+    for (int i = 0; i < 8; ++i)
+    {
+      glm::ivec3 idx = glm::ivec3(gridIdx3) + offset[i];
+      if (idx.x < 0 || idx.x >= gridResolution || idx.y < 0 || idx.y >= gridResolution || idx.z < 0 || idx.z >= gridResolution) {
+        neighborhoods[i] = -1;
+      }
+      else {
+        neighborhoods[i] = gridIndex3Dto1D(idx.x, idx.y, idx.z, gridResolution);
+      }
+    }
+
+    glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 perceived_velocity = glm::vec3(0.0f, 0.0f, 0.0f);
+    int flag1 = 0, flag2 = 0;
+
+    for (int i = 0; i < 8; ++i)
+    {
+      int gridIdx = neighborhoods[i];
+      if (gridIdx == -1)
+      {
+        continue;
+      }
+      int startIdx = gridCellStartIndices[gridIdx];
+      int endIdx = gridCellEndIndices[gridIdx];
+      if (startIdx < 0)  // no boids fell into this cell
+      {
+        continue;
+      }
+
+      for (int j = startIdx; j <= endIdx; ++j)
+      {
+        if (j != index)
+        {
+          float m_distance = glm::distance(pos[j], pos[index]);
+          if (m_distance < rule1Distance)
+          {
+            perceived_center += pos[j];
+            ++flag1;
+          }
+          if (m_distance < rule2Distance)
+          {
+            c -= (pos[j] - pos[index]);
+          }
+          if (m_distance < rule3Distance)
+          {
+            perceived_velocity += vel1[j];
+            ++flag2;
+          }
+        }
+      }
+    }
+
+    if (flag1 > 0) perceived_center /= (float)flag1;
+    else perceived_center = pos[index];
+
+    if (flag2 > 0) perceived_velocity /= (float)flag2;
+
+    glm::vec3 new_velocity = rule1Scale * (perceived_center - pos[index]) + rule2Scale * c + rule3Scale * perceived_velocity + vel1[index];
+    if (glm::length(new_velocity) > maxSpeed) new_velocity = glm::normalize(new_velocity) * maxSpeed;
+
+    vel2[index] = new_velocity;
+  }
 }
+
+// Reshuffle position and velocity so they are contiguous (coherent) per grid cell.
+__global__ void kernPosAndVel(int N, int *particleArrayIndices, glm::vec3 *pos,
+  glm::vec3 *vel1, glm::vec3 *s_pos, glm::vec3 *s_vel1)
+{
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N)
+  {
+    int idx = particleArrayIndices[index];
+    s_pos[index] = pos[idx];
+    s_vel1[index] = vel1[idx];
+  }
+}
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
   // TODO-1.2 ping-pong the velocity buffers
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +672,23 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3 *temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +707,28 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerGridCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  kernPosAndVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_coherentPos, dev_coherentVel1);
+
+  // Use the coherent kernel on the reshuffled buffers; no extra indirection is needed.
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_coherentPos, dev_coherentVel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_coherentPos, dev_vel2);
+
+  glm::vec3 *temp1 = dev_pos;
+  dev_pos = dev_coherentPos;
+  dev_coherentPos = temp1;
+
+  glm::vec3 *temp2 = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp2;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +737,13 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+
+  cudaFree(dev_coherentPos);
+  cudaFree(dev_coherentVel1);
 }
 
 void Boids::unitTest() {