From 46fff41f3d35ba55131cce49e63a167561ea77e5 Mon Sep 17 00:00:00 2001 From: FridaWang <379788704@qq.com> Date: Fri, 9 Sep 2022 01:19:49 -0400 Subject: [PATCH 1/7] finish part2.3 --- src/kernel.cu | 833 +++++++++++++++++++++++++++++++++++--------------- src/main.cpp | 6 +- 2 files changed, 584 insertions(+), 255 deletions(-) diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..c7002e1 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -20,15 +20,15 @@ /** * Check for CUDA errors; print and exit if there was a problem. */ -void checkCUDAError(const char *msg, int line = -1) { - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - if (line >= 0) { - fprintf(stderr, "Line %d: ", line); +void checkCUDAError(const char* msg, int line = -1) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + if (line >= 0) { + fprintf(stderr, "Line %d: ", line); + } + fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); + exit(EXIT_FAILURE); } - fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } } @@ -66,25 +66,26 @@ dim3 threadsPerBlock(blockSize); // Consider why you would need two velocity buffers in a simulation where each // boid cares about its neighbors' velocities. // These are called ping-pong buffers. -glm::vec3 *dev_pos; -glm::vec3 *dev_vel1; -glm::vec3 *dev_vel2; +glm::vec3* dev_pos; +glm::vec3* dev_vel1; +glm::vec3* dev_vel2; // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust // pointers on your own too. // For efficient sorting and the uniform grid. These should always be parallel. -int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? -int *dev_particleGridIndices; // What grid cell is this particle in? +int* dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? +int* dev_particleGridIndices; // What grid cell is this particle in? 
// needed for use with thrust thrust::device_ptr dev_thrust_particleArrayIndices; thrust::device_ptr dev_thrust_particleGridIndices; -int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs -int *dev_gridCellEndIndices; // to this cell? +int* dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs +int* dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_sorted_pos; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -99,13 +100,13 @@ glm::vec3 gridMinimum; ******************/ __host__ __device__ unsigned int hash(unsigned int a) { - a = (a + 0x7ed55d16) + (a << 12); - a = (a ^ 0xc761c23c) ^ (a >> 19); - a = (a + 0x165667b1) + (a << 5); - a = (a + 0xd3a2646c) ^ (a << 9); - a = (a + 0xfd7046c5) + (a << 3); - a = (a ^ 0xb55a4f09) ^ (a >> 16); - return a; + a = (a + 0x7ed55d16) + (a << 12); + a = (a ^ 0xc761c23c) ^ (a >> 19); + a = (a + 0x165667b1) + (a << 5); + a = (a + 0xd3a2646c) ^ (a << 9); + a = (a + 0xfd7046c5) + (a << 3); + a = (a ^ 0xb55a4f09) ^ (a >> 16); + return a; } /** @@ -113,63 +114,73 @@ __host__ __device__ unsigned int hash(unsigned int a) { * Function for generating a random vec3. */ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) { - thrust::default_random_engine rng(hash((int)(index * time))); - thrust::uniform_real_distribution unitDistrib(-1, 1); + thrust::default_random_engine rng(hash((int)(index * time))); + thrust::uniform_real_distribution unitDistrib(-1, 1); - return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); + return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng)); } /** * LOOK-1.2 - This is a basic CUDA kernel. 
* CUDA kernel for generating boids with a specified mass randomly around the star. */ -__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - glm::vec3 rand = generateRandomVec3(time, index); - arr[index].x = scale * rand.x; - arr[index].y = scale * rand.y; - arr[index].z = scale * rand.z; - } +__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3* arr, float scale) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + glm::vec3 rand = generateRandomVec3(time, index); + arr[index].x = scale * rand.x; + arr[index].y = scale * rand.y; + arr[index].z = scale * rand.z; + } } /** * Initialize memory, update some globals */ void Boids::initSimulation(int N) { - numObjects = N; - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - // LOOK-1.2 - This is basic CUDA memory management and error checking. - // Don't forget to cudaFree in Boids::endSimulation. - cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); - - cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); - - cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); - - // LOOK-1.2 - This is a typical CUDA kernel invocation. 
- kernGenerateRandomPosArray<<>>(1, numObjects, - dev_pos, scene_scale); - checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); - - // LOOK-2.1 computing grid params - gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); - int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; - gridSideCount = 2 * halfSideCount; - - gridCellCount = gridSideCount * gridSideCount * gridSideCount; - gridInverseCellWidth = 1.0f / gridCellWidth; - float halfGridWidth = gridCellWidth * halfSideCount; - gridMinimum.x -= halfGridWidth; - gridMinimum.y -= halfGridWidth; - gridMinimum.z -= halfGridWidth; - - // TODO-2.1 TODO-2.3 - Allocate additional buffers here. - cudaDeviceSynchronize(); + numObjects = N; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + // LOOK-1.2 - This is basic CUDA memory management and error checking. + // Don't forget to cudaFree in Boids::endSimulation. + cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); + + cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); + + cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + + // LOOK-1.2 - This is a typical CUDA kernel invocation. 
+ kernGenerateRandomPosArray << > > (1, numObjects, + dev_pos, scene_scale); + checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + + // LOOK-2.1 computing grid params + gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; + gridSideCount = 2 * halfSideCount; + + gridCellCount = gridSideCount * gridSideCount * gridSideCount; + gridInverseCellWidth = 1.0f / gridCellWidth; + float halfGridWidth = gridCellWidth * halfSideCount; + gridMinimum.x -= halfGridWidth; + gridMinimum.y -= halfGridWidth; + gridMinimum.z -= halfGridWidth; + + // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3)); + + cudaDeviceSynchronize(); } @@ -180,42 +191,42 @@ void Boids::initSimulation(int N) { /** * Copy the boid positions into the VBO so that they can be drawn by OpenGL. 
*/ -__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); +__global__ void kernCopyPositionsToVBO(int N, glm::vec3* pos, float* vbo, float s_scale) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); - float c_scale = -1.0f / s_scale; + float c_scale = -1.0f / s_scale; - if (index < N) { - vbo[4 * index + 0] = pos[index].x * c_scale; - vbo[4 * index + 1] = pos[index].y * c_scale; - vbo[4 * index + 2] = pos[index].z * c_scale; - vbo[4 * index + 3] = 1.0f; - } + if (index < N) { + vbo[4 * index + 0] = pos[index].x * c_scale; + vbo[4 * index + 1] = pos[index].y * c_scale; + vbo[4 * index + 2] = pos[index].z * c_scale; + vbo[4 * index + 3] = 1.0f; + } } -__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) { - int index = threadIdx.x + (blockIdx.x * blockDim.x); +__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3* vel, float* vbo, float s_scale) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index < N) { - vbo[4 * index + 0] = vel[index].x + 0.3f; - vbo[4 * index + 1] = vel[index].y + 0.3f; - vbo[4 * index + 2] = vel[index].z + 0.3f; - vbo[4 * index + 3] = 1.0f; - } + if (index < N) { + vbo[4 * index + 0] = vel[index].x + 0.3f; + vbo[4 * index + 1] = vel[index].y + 0.3f; + vbo[4 * index + 2] = vel[index].z + 0.3f; + vbo[4 * index + 3] = 1.0f; + } } /** * Wrapper for call to the kernCopyboidsToVBO CUDA kernel. 
*/ -void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { - dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); +void Boids::copyBoidsToVBO(float* vbodptr_positions, float* vbodptr_velocities) { + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO << > > (numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO << > > (numObjects, dev_vel1, vbodptr_velocities, scene_scale); - checkCUDAErrorWithLine("copyBoidsToVBO failed!"); + checkCUDAErrorWithLine("copyBoidsToVBO failed!"); - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } @@ -229,47 +240,90 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * Compute the new velocity on the body with index `iSelf` due to the `N` boids * in the `pos` and `vel` arrays. 
*/ -__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); +__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3* pos, const glm::vec3* vel) { + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + // Rule 2: boids try to stay a distance d away from each other + // Rule 3: boids try to match the speed of surrounding boids + glm::vec3 perceivedCenter(0.f, 0.f, 0.f); + glm::vec3 seperation(0.f, 0.f, 0.f); + glm::vec3 perceivedVel(0.f, 0.f, 0.f); + + glm::vec3 dVel(0.f, 0.f, 0.f); + + int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0; + + for (int i = 0; i < N; i++) { + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule1Distance) { + perceivedCenter += pos[i]; + ++numOfNeighborsRule1; + } + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule2Distance) { + seperation -= (pos[i] - pos[iSelf]); + } + if (i != iSelf && glm::distance(pos[i], pos[iSelf]) < rule3Distance) { + perceivedVel += vel[i]; + ++numOfNeighborsRule3; + } + } + + if (numOfNeighborsRule1 > 0) { + perceivedCenter /= numOfNeighborsRule1; + dVel += (perceivedCenter - pos[iSelf]) * rule1Scale; + } + + dVel += seperation * rule2Scale; + + if (numOfNeighborsRule3 > 0) { + perceivedVel /= numOfNeighborsRule3; + dVel += perceivedVel * rule3Scale; + } + return dVel; } /** * TODO-1.2 implement basic flocking * For each of the `N` bodies, update its position based on its current velocity. */ -__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. 
Question: why NOT vel1? +__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3* pos, + glm::vec3* vel1, glm::vec3* vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // Compute a new velocity based on pos and vel1 + glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1); + // Clamp the speed + float speed = glm::length(newVel); + if (speed > maxSpeed) { + newVel = newVel / speed * maxSpeed; + } + // Record the new velocity into vel2. Question: why NOT vel1? + vel2[index] = newVel; } /** * LOOK-1.2 Since this is pretty trivial, we implemented it for you. * For each of the `N` bodies, update its position based on its current velocity. */ -__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { - // Update position by velocity - int index = threadIdx.x + (blockIdx.x * blockDim.x); - if (index >= N) { - return; - } - glm::vec3 thisPos = pos[index]; - thisPos += vel[index] * dt; +__global__ void kernUpdatePos(int N, float dt, glm::vec3* pos, glm::vec3* vel) { + // Update position by velocity + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 thisPos = pos[index]; + thisPos += vel[index] * dt; - // Wrap the boids around so we don't lose them - thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; - thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; - thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; + // Wrap the boids around so we don't lose them + thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x; + thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y; + thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z; - thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x; - thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; - thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; + thisPos.x = thisPos.x > scene_scale ? 
-scene_scale : thisPos.x; + thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y; + thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z; - pos[index] = thisPos; + pos[index] = thisPos; } // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. @@ -279,179 +333,454 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { // for(y) // for(z)? Or some other order? __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; + return x + y * gridResolution + z * gridResolution * gridResolution; } __global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3* pos, int* indices, int* gridIndices) { // TODO-2.1 // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + indices[index] = index; + glm::ivec3 gridIndex = glm::floor((pos[index] - gridMin) * inverseCellWidth); + gridIndices[index] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution); } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids -__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } +__global__ void kernResetIntBuffer(int N, int* intBuffer, int value) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + intBuffer[index] = value; + } } -__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the 
start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" +__global__ void kernIdentifyCellStartEnd(int N, int* particleGridIndices, + int* gridCellStartIndices, int* gridCellEndIndices) { + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + // if first cell or index doesn't match the one before it, must be a start index + if (index == 0 || particleGridIndices[index] != particleGridIndices[index - 1]) { + gridCellStartIndices[particleGridIndices[index]] = index; + } + // if last cell or index doesn't match the one after it, must be a end index + if (index == N - 1 || particleGridIndices[index] != particleGridIndices[index + 1]) { + gridCellEndIndices[particleGridIndices[index]] = index; + } } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. 
- // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int* gridCellStartIndices, int* gridCellEndIndices, + int* particleArrayIndices, + glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) { + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + // - Identify the grid cell that this particle is in + int selfIdx = particleArrayIndices[index]; + glm::vec3 selfPos = pos[selfIdx]; + selfPos = glm::floor((selfPos - gridMin) * inverseCellWidth); + + // - Identify which cells may contain neighbors. This isn't always 8. + int minX, maxX, minY, maxY, minZ, maxZ; + + minX = imax(selfPos.x - 1, 0); + maxX = imin(selfPos.x + 1, gridResolution - 1); + minY = imax(selfPos.y - 1, 0); + maxY = imin(selfPos.y + 1, gridResolution - 1); + minZ = imax(selfPos.z - 1, 0); + maxZ = imin(selfPos.z + 1, gridResolution - 1); + + glm::vec3 perceivedCenter(0.f, 0.f, 0.f); + glm::vec3 seperation(0.f, 0.f, 0.f); + glm::vec3 perceivedVel(0.f, 0.f, 0.f); + + glm::vec3 dVel(0.f, 0.f, 0.f); + + int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0; + + for (int z = minZ; z <= maxZ; ++z) { + for (int y = minY; y <= maxY; ++y) { + for (int x = minX; x <= maxX; ++x) { + int cellIdx = gridIndex3Dto1D(x, y, z, gridResolution); + // - For each cell, read the start/end indices in the boid pointer array. + int startIndex = gridCellStartIndices[cellIdx]; + int endIndex = gridCellEndIndices[cellIdx]; + + if (startIndex != -1) { + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ for (int i = startIndex; i <= endIndex; ++i) { + int boidIdx = particleArrayIndices[i]; + if (boidIdx != selfIdx) { + float distance = glm::distance(pos[boidIdx], pos[selfIdx]); + if (distance < rule1Distance) { + perceivedCenter += pos[boidIdx]; + ++numOfNeighborsRule1; + } + if (distance < rule2Distance) { + seperation -= (pos[boidIdx] - pos[selfIdx]); + } + if (distance < rule3Distance) { + perceivedVel += vel1[boidIdx]; + ++numOfNeighborsRule3; + } + } + } + } + } + } + } + + if (numOfNeighborsRule1 > 0) { + perceivedCenter /= numOfNeighborsRule1; + dVel += (perceivedCenter - pos[selfIdx]) * rule1Scale; + } + + dVel += seperation * rule2Scale; + + if (numOfNeighborsRule3 > 0) { + perceivedVel /= numOfNeighborsRule3; + dVel += perceivedVel * rule3Scale; + } + + // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 tmpVel = vel1[selfIdx] + dVel; + float speed = glm::length(tmpVel); + if (speed > maxSpeed) { + tmpVel = tmpVel / speed * maxSpeed; + } + + vel2[selfIdx] = tmpVel; } +__global__ void kernReshuffleBuffer(int N, int* particleArrayIndices, glm::vec3* pos, + glm::vec3* sortedPos, glm::vec3* vel1, glm::vec3* vel2) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + sortedPos[index] = pos[particleArrayIndices[index]]; + vel2[index] = vel1[particleArrayIndices[index]]; +} __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. 
- // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int* gridCellStartIndices, int* gridCellEndIndices, + glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) { + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + // - Identify the grid cell that this particle is in + int selfIdx = index; + glm::vec3 selfPos = pos[selfIdx]; + selfPos = glm::floor((selfPos - gridMin) * inverseCellWidth); + + // - Identify which cells may contain neighbors. This isn't always 8. + int minX, maxX, minY, maxY, minZ, maxZ; + + minX = imax(selfPos.x - 1, 0); + maxX = imin(selfPos.x + 1, gridResolution - 1); + minY = imax(selfPos.y - 1, 0); + maxY = imin(selfPos.y + 1, gridResolution - 1); + minZ = imax(selfPos.z - 1, 0); + maxZ = imin(selfPos.z + 1, gridResolution - 1); + + glm::vec3 perceivedCenter(0.f, 0.f, 0.f); + glm::vec3 seperation(0.f, 0.f, 0.f); + glm::vec3 perceivedVel(0.f, 0.f, 0.f); + + glm::vec3 dVel(0.f, 0.f, 0.f); + + int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0; + + for (int z = minZ; z <= maxZ; ++z) { + for (int y = minY; y <= maxY; ++y) { + for (int x = minX; x <= maxX; ++x) { + int cellIdx = gridIndex3Dto1D(x, y, z, gridResolution); + // - For each cell, read the start/end indices in the boid pointer array. 
+ int startIndex = gridCellStartIndices[cellIdx]; + int endIndex = gridCellEndIndices[cellIdx]; + + if (startIndex != -1) { + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + for (int i = startIndex; i <= endIndex; ++i) { + int boidIdx = i; + if (boidIdx != selfIdx) { + float distance = glm::distance(pos[boidIdx], pos[selfIdx]); + if (distance < rule1Distance) { + perceivedCenter += pos[boidIdx]; + ++numOfNeighborsRule1; + } + if (distance < rule2Distance) { + seperation -= (pos[boidIdx] - pos[selfIdx]); + } + if (distance < rule3Distance) { + perceivedVel += vel1[boidIdx]; + ++numOfNeighborsRule3; + } + } + } + } + } + } + } + + if (numOfNeighborsRule1 > 0) { + perceivedCenter /= numOfNeighborsRule1; + dVel += (perceivedCenter - pos[selfIdx]) * rule1Scale; + } + + dVel += seperation * rule2Scale; + + if (numOfNeighborsRule3 > 0) { + perceivedVel /= numOfNeighborsRule3; + dVel += perceivedVel * rule3Scale; + } + + // - Clamp the speed change before putting the new speed in vel2 + glm::vec3 tmpVel = vel1[selfIdx] + dVel; + float speed = glm::length(tmpVel); + if (speed > maxSpeed) { + tmpVel = tmpVel / speed * maxSpeed; + } + + vel2[selfIdx] = tmpVel; } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce << > > (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // TODO-1.2 ping-pong the velocity buffers + cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); } void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + // TODO-2.1 + // Uniform Grid Neighbor search using Thrust sort. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + kernComputeIndices << > > (numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize); + // Reset the startIndex array + kernResetIntBuffer << > > (gridCellCount, + dev_gridCellStartIndices, + -1); + + kernIdentifyCellStartEnd << > > (numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos, + dev_vel1, + dev_vel2); + + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + + // - Ping-pong buffers as needed + cudaMemcpy(dev_vel1, dev_vel2, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); } void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
+ // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + kernComputeIndices << > > (numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices); + + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize); + // Reset the startIndex array + kernResetIntBuffer << > > (gridCellCount, + dev_gridCellStartIndices, + -1); + + kernIdentifyCellStartEnd << > > (numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices); + + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. 
+ // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernReshuffleBuffer << > > (numObjects, + dev_particleArrayIndices, + dev_pos, + dev_sorted_pos, + dev_vel1, + dev_vel2); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent << > > (numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_sorted_pos, + dev_vel2, + dev_vel1); + + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_sorted_pos, dev_vel1); + + // - Ping-pong buffers as needed + cudaMemcpy(dev_vel2, dev_vel1, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); + cudaMemcpy(dev_pos, dev_sorted_pos, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice); } void Boids::endSimulation() { - cudaFree(dev_vel1); - cudaFree(dev_vel2); - cudaFree(dev_pos); + cudaFree(dev_vel1); + cudaFree(dev_vel2); + cudaFree(dev_pos); - // TODO-2.1 TODO-2.3 - Free any additional buffers here. + // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_sorted_pos); } void Boids::unitTest() { - // LOOK-1.2 Feel free to write additional tests here. 
- - // test unstable sort - int *dev_intKeys; - int *dev_intValues; - int N = 10; - - std::unique_ptrintKeys{ new int[N] }; - std::unique_ptrintValues{ new int[N] }; - - intKeys[0] = 0; intValues[0] = 0; - intKeys[1] = 1; intValues[1] = 1; - intKeys[2] = 0; intValues[2] = 2; - intKeys[3] = 3; intValues[3] = 3; - intKeys[4] = 0; intValues[4] = 4; - intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; - - cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); - - cudaMalloc((void**)&dev_intValues, N * sizeof(int)); - checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); - - dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); - - std::cout << "before unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // How to copy data to the GPU - cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); - - // Wrap device vectors in thrust iterators for use with thrust. 
- thrust::device_ptr dev_thrust_keys(dev_intKeys); - thrust::device_ptr dev_thrust_values(dev_intValues); - // LOOK-2.1 Example for using thrust::sort_by_key - thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); - - // How to copy data back to the CPU side from the GPU - cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); - cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); - checkCUDAErrorWithLine("memcpy back failed!"); - - std::cout << "after unstable sort: " << std::endl; - for (int i = 0; i < N; i++) { - std::cout << " key: " << intKeys[i]; - std::cout << " value: " << intValues[i] << std::endl; - } - - // cleanup - cudaFree(dev_intKeys); - cudaFree(dev_intValues); - checkCUDAErrorWithLine("cudaFree failed!"); - return; + // LOOK-1.2 Feel free to write additional tests here. + + // test unstable sort + int* dev_intKeys; + int* dev_intValues; + int N = 10; + + std::unique_ptrintKeys{ new int[N] }; + std::unique_ptrintValues{ new int[N] }; + + intKeys[0] = 0; intValues[0] = 0; + intKeys[1] = 1; intValues[1] = 1; + intKeys[2] = 0; intValues[2] = 2; + intKeys[3] = 3; intValues[3] = 3; + intKeys[4] = 0; intValues[4] = 4; + intKeys[5] = 2; intValues[5] = 5; + intKeys[6] = 2; intValues[6] = 6; + intKeys[7] = 0; intValues[7] = 7; + intKeys[8] = 5; intValues[8] = 8; + intKeys[9] = 6; intValues[9] = 9; + + cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); + + cudaMalloc((void**)&dev_intValues, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!"); + + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + + std::cout << "before unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // How to copy data to the GPU + cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, 
cudaMemcpyHostToDevice); + cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice); + + // Wrap device vectors in thrust iterators for use with thrust. + thrust::device_ptr dev_thrust_keys(dev_intKeys); + thrust::device_ptr dev_thrust_values(dev_intValues); + // LOOK-2.1 Example for using thrust::sort_by_key + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values); + + // How to copy data back to the CPU side from the GPU + cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("memcpy back failed!"); + + std::cout << "after unstable sort: " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " key: " << intKeys[i]; + std::cout << " value: " << intValues[i] << std::endl; + } + + // cleanup + cudaFree(dev_intKeys); + cudaFree(dev_intValues); + checkCUDAErrorWithLine("cudaFree failed!"); + return; } diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..46422e0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 10000; const float DT = 0.2f; /** From 39500c8dadb8e56b1b230f071502d40dd52fab59 Mon Sep 17 00:00:00 2001 From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com> Date: Thu, 8 Sep 2022 22:21:03 -0700 Subject: [PATCH 2/7] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d63a6a1..9112e77 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * 
(TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Wenqing Wang +[LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) +* Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti ### (TODO: Your README) From fb7e53e2c3045b807d7a52b939a3aea745912453 Mon Sep 17 00:00:00 2001 From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com> Date: Sun, 11 Sep 2022 17:41:11 -0400 Subject: [PATCH 3/7] Update README.md --- README.md | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9112e77..084cef1 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,27 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -Wenqing Wang -[LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) +* Wenqing Wang + * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) * Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti -### (TODO: Your README) +## Screenshots +* Coherent with 10K boids +![1](https://user-images.githubusercontent.com/33616958/189548869-6924fda7-1c0e-4308-952d-dffebb1ec029.gif) +* Coherent with 20K boids +![2](https://user-images.githubusercontent.com/33616958/189548865-61dd7752-f4a0-45aa-a383-f948aa85e920.gif) +* Coherent with 80K boids +![8](https://user-images.githubusercontent.com/33616958/189548868-0981d6cb-dec3-4cd5-b6ca-efe10a81c999.gif) + + + +## Performance Analysis +From the plots below, we can see that for all 3 methods, the average frame per second decreases as the # of boids increase. That's becasuse we'll need to process more data as the # of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute force search to check every rule for every 2 boids, we check the 27 neighbor cells of each boid, which greatly reduces the simulation effort. 
The performance was further improved after we optimized the data access method in the coherent method. +![fps_w_v](https://user-images.githubusercontent.com/33616958/189547897-78ed6b50-76d0-4bb7-90e3-e1e491814548.png) + +After disabling visualization, the framerates reported below are for the the simulation only: +![fps_wo_v](https://user-images.githubusercontent.com/33616958/189547898-3ca487ae-1ada-4b53-90f0-550108a8399c.png) + +Also, it seems that changing the block size doesn't have much impact on performance. I think this is because it does not affect the total data we need to process or the number of threads needed to process it. +![fps_w_blocksize](https://user-images.githubusercontent.com/33616958/189547900-52a10a80-40e4-4ddc-af58-9eb90d97be9c.png) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) From 9f898538a0c45edb51088ae4972a2b64fd825024 Mon Sep 17 00:00:00 2001 From: FridaWang <379788704@qq.com> Date: Sun, 11 Sep 2022 17:42:45 -0400 Subject: [PATCH 4/7] tiny fix --- src/kernel.cu | 2 +- src/main.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernel.cu b/src/kernel.cu index c7002e1..cc3a31d 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -158,7 +158,7 @@ void Boids::initSimulation(int N) { checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); // LOOK-2.1 computing grid params - gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + gridCellWidth = 1.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; diff --git a/src/main.cpp b/src/main.cpp index 46422e0..8a53937 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -18,7 +18,7 @@ #define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 10000; +const int N_FOR_VIS = 20000; const 
float DT = 0.2f; /** From f4f8f2cdeb0b0337a4bacdda83868ccc999a6a4b Mon Sep 17 00:00:00 2001 From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com> Date: Sun, 11 Sep 2022 20:31:31 -0400 Subject: [PATCH 5/7] Update README.md --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 084cef1..eb1fca7 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,19 @@ Project 1 - Flocking** ## Performance Analysis -From the plots below, we can see that for all 3 methods, the average frame per second decreases as the # of boids increase. That's becasuse we'll need to process more data as the # of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute force search to check every rule for every 2 boids, we check the 27 neighbor cells of each boid, which greatly reduces the simulation effort. The performance was further improved after we optimized the data access method in the coherent method. ![fps_w_v](https://user-images.githubusercontent.com/33616958/189547897-78ed6b50-76d0-4bb7-90e3-e1e491814548.png) After disabling visualization, the framerates reported below are for the the simulation only: ![fps_wo_v](https://user-images.githubusercontent.com/33616958/189547898-3ca487ae-1ada-4b53-90f0-550108a8399c.png) -Also, it seems that changing the block size doesn't have much impact on performance. I think this is because it does not affect the total data we need to process or the number of threads needed to process it. ![fps_w_blocksize](https://user-images.githubusercontent.com/33616958/189547900-52a10a80-40e4-4ddc-af58-9eb90d97be9c.png) +* Questions +1. For each implementation, how does changing the number of boids affect performance? Why do you think this is? + - From the plots above, we can see that for all 3 methods, the average frame per second decreases as the # of boids increase. 
That's because we'll need to process more data as the # of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute force search to check every rule for every pair of boids, we check the 27 neighbor cells of each boid, which greatly reduces the simulation effort. +2. For each implementation, how does changing the block count and block size affect performance? Why do you think this is? + - It seems that changing the block size doesn't have much impact on performance (at least no clear pattern). I think this is because it does not affect the total data we need to process or the number of threads needed to process these data. +3. For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not? + - Yes, the performance improves when we rearrange the data buffers in the coherent uniform grid method. This is because we no longer need to get the boid index from the `dev_particleArrayIndices` buffer, which reduces the data access operations. +4. Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check! + - Changing the cell width of the uniform grid to be the neighborhood distance and checking 27 cells instead of 8 improves the performance on my laptop. I suspect this is because although we checked more cells, since we reduced the cell width of the uniform grid to half the original size, we actually checked a smaller volume (containing fewer boids) each time. 
From 5d4b5518a5e96a65555cd9863f5ab517ed54d36d Mon Sep 17 00:00:00 2001 From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com> Date: Wed, 14 Sep 2022 17:19:44 -0400 Subject: [PATCH 6/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index eb1fca7..5f5c1ea 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** +====================== * Wenqing Wang * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) From 56fcfd2c890a27972359162dbba8ece3d87528c5 Mon Sep 17 00:00:00 2001 From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com> Date: Sun, 18 Sep 2022 20:57:33 -0400 Subject: [PATCH 7/7] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5f5c1ea..be1a3e7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, -Project 1 - Flocking** +Flocking ====================== +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** + * Wenqing Wang * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) * Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti