diff --git a/README.md b/README.md
index d63a6a1..5a2d07a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,42 @@
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**

-* (TODO) YOUR NAME HERE
+Dear Grader!
+
+As you may not know, I registered late for the class (on the last day of registration) and arranged adjusted due dates with Professor Shehzan over email; my Project 1 due date is Sept 22. However, as you can see, I was about 26 hours late. I know there are 4 late days without penalty, but I only want to use ONE on this project. I didn't mean to spend a late day on just 2 extra hours, so please feel free to deduct points for being one day (hopefully only 2 hours!) late.
+
+Thanks
+
+* Xiao Wei
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Tested on: Windows 10, i9-9900k @ 3.6GHz 16.0GB, RTX 2080 SUPER 16GB

### (TODO: Your README)

-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Simulation gifs
+
+![p1-6](https://user-images.githubusercontent.com/66859615/134622482-b29722aa-0751-4b32-9732-9a65009cc15f.gif)
+
+![p1-5](https://user-images.githubusercontent.com/66859615/134622494-2c6ffde0-e230-4ce1-b757-a03d36d04777.gif)
+
+![20210924054902](https://user-images.githubusercontent.com/66859615/134626140-86cab152-370f-4cfa-9b1c-79ab40f0faa6.jpg)
+
+![20210924055435](https://user-images.githubusercontent.com/66859615/134626143-63d35c54-4bd2-4be4-816c-46a207ff12b4.jpg)
+
+![20210924061425](https://user-images.githubusercontent.com/66859615/134627047-6a582d78-b49f-45cf-bb48-43236121419a.jpg)
+
+## For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+Performance drops as the number of boids increases. Each boid needs a thread and compute time to update its velocity and position, and each boid also has more candidate neighbors to test, so more boids means more work per frame.
+
+## For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+Changing the block size (and with it the block count) does not change performance drastically. The total number of threads stays the same and launching extra blocks is cheap, so as long as each block is large enough to keep the GPU occupied, the same work gets scheduled either way.
+
+## For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+Yes, the performance improved, and this is the outcome I expected: the coherent version no longer has to go through dev_particleArrayIndices for every neighbor, and the reshuffled pos/vel buffers make the neighbor reads contiguous in memory.
+
+## Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+Checking 27 cells was actually somewhat faster in my tests. I think this is because the 27-cell configuration uses a smaller cell width, so the total searched volume around each boid is tighter (27 * w^3 versus 8 * (2w)^3 = 64 * w^3) and fewer out-of-range boids have to be distance-tested, which outweighs the cost of looping over more cells.
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..904e858 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
#include 
#include "utilityCore.hpp"
#include "kernel.h"
+//#include "device_launch_parameters.h"

// LOOK-2.1 potentially useful for doing grid-based neighbor search
#ifndef imax
@@ -85,6 +86,9 @@ int *dev_gridCellEndIndices; // to this cell?

// TODO-2.3 - consider what additional buffers you might need to reshuffle
// the position and velocity data to be coherent within cells.
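+// Buffers added for TODO-2.3: copies of pos/vel reordered into grid-cell order (filled by
+// kernShufflePosVel below) so that boids in the same cell are contiguous in memory for the
+// coherent-grid neighbor search.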
+glm::vec3* dev_posShuffle;
+glm::vec3* dev_velShuffle;
+
// LOOK-2.1 - Grid parameters based on simulation parameters.
// These are automatically computed for you in Boids::initSimulation
@@ -169,6 +173,27 @@ void Boids::initSimulation(int N) {
  gridMinimum.z -= halfGridWidth;

  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_posShuffle, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_posShuffle failed!");
+
+  cudaMalloc((void**)&dev_velShuffle, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_velShuffle failed!");
+
+  cudaDeviceSynchronize();
}
@@ -229,11 +254,137 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
* Compute the new velocity on the body with index `iSelf` due to the `N` boids
* in the `pos` and `vel` arrays.
*/
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3* pos, const glm::vec3* vel) {
+  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+  /*
+  function rule1(Boid boid)
+
+    Vector perceived_center
+
+    foreach Boid b:
+      if b != boid and distance(b, boid) < rule1Distance then
+        perceived_center += b.position
+      endif
+    end
+
+    perceived_center /= number_of_neighbors
+
+    return (perceived_center - boid.position) * rule1Scale
+  end
+  */
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule1Distance) {
+      perceivedCenter += pos[i];
+      numRule1Neighbor++;
+    }
+  }
+
+  // Only apply rule 1 when at least one neighbor was found; otherwise the zero
+  // "center" would pull every isolated boid toward the origin.
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+
+  // Rule 2: boids try to stay a distance d away from each other
+  /*
+  function rule2(Boid boid)
+
+    Vector c = 0
+
+    foreach Boid b
+      if b != boid and distance(b, boid) < rule2Distance then
+        c -= (b.position - boid.position)
+      endif
+    end
+
+    return c * rule2Scale
+  end
+  */
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule2Distance) {
+      separate -= (pos[i] - pos[iSelf]);
+    }
+  }
+
+  result += separate * rule2Scale;
+
  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  /*
+  function rule3(Boid boid)
+
+    Vector perceived_velocity
+
+    foreach Boid b
+      if b != boid and distance(b, boid) < rule3Distance then
+        perceived_velocity += b.velocity
+      endif
+    end
+
+    perceived_velocity /= number_of_neighbors
+
+    return perceived_velocity * rule3Scale
+  end
+  */
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule3Distance) {
+      perceivedVelocity += vel[i];
+      numRule3Neighbor++;
+    }
+  }
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+
+  result += perceivedVelocity * rule3Scale;
+
+  return result;
}

/**
@@ -245,6 +396,25 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
  // Compute a new velocity based on pos and vel1
  // Clamp the speed
  // Record the new velocity into vel2. Question: why NOT vel1?
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  glm::vec3 changedVelocity = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[index] = changedVelocity;
}

/**
@@ -289,6 +459,21 @@ __global__ void kernComputeIndices(int N, int gridResolution,
  // - Label each boid with the index of its grid cell.
  // - Set up a parallel array of integer indices as pointers to the actual
  //   boid data in pos and vel1/vel2
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int iX = std::floor((pos[index].x - gridMin.x) * inverseCellWidth);
+  int iY = std::floor((pos[index].y - gridMin.y) * inverseCellWidth);
+  int iZ = std::floor((pos[index].z - gridMin.z) * inverseCellWidth);
+
+  indices[index] = index;
+  gridIndices[index] = gridIndex3Dto1D(iX, iY, iZ, gridResolution);
}

// LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +491,28 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
  // Identify the start point of each cell in the gridIndices array.
  // This is basically a parallel unrolling of a loop that goes
  // "this index doesn't match the one before it, must be a new cell!"
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int gridIndex = particleGridIndices[index];
+
+  if (index == 0) {
+    gridCellStartIndices[gridIndex] = 0;
+  }
+  else {
+    int gridIndexLeft = particleGridIndices[index - 1];
+
+    if (gridIndexLeft != gridIndex) {
+      gridCellStartIndices[gridIndex] = index;
+      gridCellEndIndices[gridIndexLeft] = index - 1;
+    }
+  }
+
+  // The last boid closes the final occupied cell; without this, that cell's
+  // end index would stay at -1 and its boids would never be visited.
+  if (index == N - 1) {
+    gridCellEndIndices[gridIndex] = N - 1;
+  }
}

__global__ void kernUpdateVelNeighborSearchScattered(
@@ -316,12 +523,107 @@ __global__ void kernUpdateVelNeighborSearchScattered(
  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
  // the number of boids that need to be checked.
+
+  int iSelf = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (iSelf >= N) {
+    return;
+  }
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
+  glm::vec3 posSelf = pos[iSelf];
+  glm::vec3 gridPos = (posSelf - gridMin) * inverseCellWidth;
+  glm::vec3 gridPosFloor = floor(gridPos);
+  glm::vec3 gridPosRound = round(gridPos);
+  int gridIndex = gridIndex3Dto1D(gridPosFloor.x, gridPosFloor.y, gridPosFloor.z, gridResolution);
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  for (int x = -1; x < 1; x++)
+  {
+    for (int y = -1; y < 1; y++)
+    {
+      for (int z = -1; z < 1; z++)
+      {
+        int neighborX = gridPosRound.x + x;
+        int neighborY = gridPosRound.y + y;
+        int neighborZ = gridPosRound.z + z;
+
+        if (neighborX < 0 || neighborX >= gridResolution || neighborY < 0 || neighborY >= gridResolution || neighborZ < 0 || neighborZ >= gridResolution)
+          continue;
+
+        // - For each cell, read the start/end indices in the boid pointer array.
+        int neighborGridIndex = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+
+        int startIndex = gridCellStartIndices[neighborGridIndex];
+        int endIndex = gridCellEndIndices[neighborGridIndex];
+
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int cellIndex = startIndex; cellIndex <= endIndex; cellIndex++) {
+          int i = particleArrayIndices[cellIndex];
+          if (i == iSelf) {
+            continue;
+          }
+
+          float distance = glm::distance(pos[i], pos[iSelf]);
+
+          if (distance < rule1Distance) {
+            perceivedCenter += pos[i];
+            numRule1Neighbor++;
+          }
+
+          if (distance < rule2Distance) {
+            separate -= (pos[i] - pos[iSelf]);
+          }
+
+          if (distance < rule3Distance) {
+            perceivedVelocity += vel1[i];
+            numRule3Neighbor++;
+          }
+        }
+      }
+    }
+  }
+
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+  result += separate * rule2Scale;
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+  result += perceivedVelocity * rule3Scale;
+
+  // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 changedVelocity = vel1[iSelf] + result;
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[iSelf] = changedVelocity;
}

__global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +643,106 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
  // - Access each boid in the cell and compute velocity change from
  //   the boids rules, if this boid is within the neighborhood distance.
  // - Clamp the speed change before putting the new speed in vel2
+
+  int iSelf = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (iSelf >= N) {
+    return;
+  }
+
+  glm::vec3 result(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f, 0.0f, 0.0f);
+  int numRule1Neighbor = 0;
+
+  glm::vec3 separate(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedVelocity(0.0f, 0.0f, 0.0f);
+  int numRule3Neighbor = 0;
+
+  // - Identify the grid cell that this particle is in
+  glm::vec3 posSelf = pos[iSelf];
+  glm::vec3 gridPos = (posSelf - gridMin) * inverseCellWidth;
+  glm::vec3 gridPosFloor = floor(gridPos);
+  glm::vec3 gridPosRound = round(gridPos);
+  int gridIndex = gridIndex3Dto1D(gridPosFloor.x, gridPosFloor.y, gridPosFloor.z, gridResolution);
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
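+  // round() with offsets in {-1, 0} selects the 2x2x2 block of cells nearest to this
+  // boid (its own cell plus the adjacent cell on whichever side it is closer to in each
+  // dimension), so at most 8 cells are scanned per boid.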
+
+  for (int x = -1; x < 1; x++)
+  {
+    for (int y = -1; y < 1; y++)
+    {
+      for (int z = -1; z < 1; z++)
+      {
+        int neighborX = gridPosRound.x + x;
+        int neighborY = gridPosRound.y + y;
+        int neighborZ = gridPosRound.z + z;
+
+        if (neighborX < 0 || neighborX >= gridResolution || neighborY < 0 || neighborY >= gridResolution || neighborZ < 0 || neighborZ >= gridResolution)
+          continue;
+
+        // - For each cell, read the start/end indices in the boid pointer array.
+        int neighborGridIndex = gridIndex3Dto1D(neighborX, neighborY, neighborZ, gridResolution);
+
+        int startIndex = gridCellStartIndices[neighborGridIndex];
+        int endIndex = gridCellEndIndices[neighborGridIndex];
+
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int i = startIndex; i <= endIndex; i++) {
+          if (i == iSelf) {
+            continue;
+          }
+
+          float distance = glm::distance(pos[i], pos[iSelf]);
+
+          if (distance < rule1Distance) {
+            perceivedCenter += pos[i];
+            numRule1Neighbor++;
+          }
+
+          if (distance < rule2Distance) {
+            separate -= (pos[i] - pos[iSelf]);
+          }
+
+          if (distance < rule3Distance) {
+            perceivedVelocity += vel1[i];
+            numRule3Neighbor++;
+          }
+        }
+      }
+    }
+  }
+
+  if (numRule1Neighbor > 0) {
+    perceivedCenter /= numRule1Neighbor;
+    result += (perceivedCenter - pos[iSelf]) * rule1Scale;
+  }
+  result += separate * rule2Scale;
+
+  if (numRule3Neighbor > 0) {
+    perceivedVelocity = perceivedVelocity / numRule3Neighbor;
+  }
+  result += perceivedVelocity * rule3Scale;
+
+  // - Clamp the speed change before putting the new speed in vel2
+  glm::vec3 changedVelocity = vel1[iSelf] + result;
+
+  if (glm::length(changedVelocity) > maxSpeed) {
+    changedVelocity = changedVelocity * maxSpeed / glm::length(changedVelocity);
+  }
+
+  vel2[iSelf] = changedVelocity;
}

/**
@@ -348,7 +750,12 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
*/
void Boids::stepSimulationNaive(float dt) {
  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
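+  // One step: the brute-force velocity update reads vel1 and writes the clamped result
+  // to vel2, positions are integrated from vel2, then the velocity buffers are swapped
+  // so the next step reads the values just written.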
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
  // TODO-1.2 ping-pong the velocity buffers
+  // Swap the pointers rather than assigning dev_vel1 = dev_vel2, which would make both
+  // names alias the same buffer and lose the other allocation.
+  glm::vec3* temp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = temp;
}

void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,8 +771,48 @@ void Boids::stepSimulationScatteredGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 numGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3* temp;
+  temp = dev_vel2;
+  dev_vel2 = dev_vel1;
+  dev_vel1 = temp;
+}
+
+__global__ void kernShufflePosVel(int N, int* particleArrayIndices,
+  glm::vec3* pos, glm::vec3* vel,
+  glm::vec3* posShuffled, glm::vec3* velShuffled) {
+
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  int particleArrayIndex = particleArrayIndices[index];
+
+  posShuffled[index] = pos[particleArrayIndex];
+  velShuffled[index] = vel[particleArrayIndex];
}
+
void Boids::stepSimulationCoherentGrid(float dt) {
  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
@@ -382,6 +829,38 @@ void Boids::stepSimulationCoherentGrid(float dt) {
  // - Perform velocity updates using neighbor search
  // - Update positions
  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
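+  // Same pipeline as the scattered grid, plus one extra kernShufflePosVel pass that
+  // reorders pos/vel1 into cell order before the neighbor search, so the coherent kernel
+  // indexes pos/vel directly instead of going through dev_particleArrayIndices.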
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 numGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::device_ptr<int> dev_thrust_particleGridIndices(dev_particleGridIndices);
+  thrust::device_ptr<int> dev_thrust_particleArrayIndices(dev_particleArrayIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  kernShufflePosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_posShuffle, dev_velShuffle);
+  cudaMemcpy(dev_pos, dev_posShuffle, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+  cudaMemcpy(dev_vel1, dev_velShuffle, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+
+  glm::vec3* temp;
+  temp = dev_vel2;
+  dev_vel2 = dev_vel1;
+  dev_vel1 = temp;
}

void Boids::endSimulation() {
@@ -390,6 +869,14 @@ void Boids::endSimulation() {
  cudaFree(dev_pos);

  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_posShuffle);
+  cudaFree(dev_velShuffle);
}

void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..a7b8586 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,7 +18,7 @@
#define COHERENT_GRID 0

// LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
const float DT = 0.2f;

/**