From 46fff41f3d35ba55131cce49e63a167561ea77e5 Mon Sep 17 00:00:00 2001
From: FridaWang <379788704@qq.com>
Date: Fri, 9 Sep 2022 01:19:49 -0400
Subject: [PATCH 1/7] finish part2.3

---
 src/kernel.cu | 833 +++++++++++++++++++++++++++++++++++---------------
 src/main.cpp  |   6 +-
 2 files changed, 584 insertions(+), 255 deletions(-)
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..c7002e1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -20,15 +20,15 @@
 /**
 * Check for CUDA errors; print and exit if there was a problem.
 */
-void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
+void checkCUDAError(const char* msg, int line = -1) {
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+        if (line >= 0) {
+            fprintf(stderr, "Line %d: ", line);
+        }
+        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
     }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
 }
 
 
@@ -66,25 +66,26 @@ dim3 threadsPerBlock(blockSize);
 // Consider why you would need two velocity buffers in a simulation where each
 // boid cares about its neighbors' velocities.
 // These are called ping-pong buffers.
-glm::vec3 *dev_pos;
-glm::vec3 *dev_vel1;
-glm::vec3 *dev_vel2;
+glm::vec3* dev_pos;
+glm::vec3* dev_vel1;
+glm::vec3* dev_vel2;
 
 // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust
 // pointers on your own too.
 
 // For efficient sorting and the uniform grid. These should always be parallel.
-int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
-int *dev_particleGridIndices; // What grid cell is this particle in?
+int* dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
+int* dev_particleGridIndices; // What grid cell is this particle in?
 // needed for use with thrust
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
-int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
-int *dev_gridCellEndIndices;   // to this cell?
+int* dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
+int* dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_sorted_pos; // receives pos[] gathered through the sorted particle indices (see kernReshuffleBuffer)
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -99,13 +100,13 @@ glm::vec3 gridMinimum;
 ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+    a = (a + 0x7ed55d16) + (a << 12);
+    a = (a ^ 0xc761c23c) ^ (a >> 19);
+    a = (a + 0x165667b1) + (a << 5);
+    a = (a + 0xd3a2646c) ^ (a << 9);
+    a = (a + 0xfd7046c5) + (a << 3);
+    a = (a ^ 0xb55a4f09) ^ (a >> 16);
+    return a;
 }
 
 /**
@@ -113,63 +114,73 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 * Function for generating a random vec3.
 */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+    thrust::default_random_engine rng(hash((int)(index * time)));
+    thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+    return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
 * LOOK-1.2 - This is a basic CUDA kernel.
 * CUDA kernel for generating boids with a specified mass randomly around the star.
 */
-__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+__global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3* arr, float scale) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        glm::vec3 rand = generateRandomVec3(time, index);
+        arr[index].x = scale * rand.x;
+        arr[index].y = scale * rand.y;
+        arr[index].z = scale * rand.z;
+    }
 }
 
 /**
 * Initialize memory, update some globals
 */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in  Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaDeviceSynchronize();
+    numObjects = N;
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+    // LOOK-1.2 - This is basic CUDA memory management and error checking.
+    // Don't forget to cudaFree in  Boids::endSimulation.
+    cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
+
+    cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+    cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+    // LOOK-1.2 - This is a typical CUDA kernel invocation.
+    kernGenerateRandomPosArray << <fullBlocksPerGrid, blockSize >> > (1, numObjects,
+        dev_pos, scene_scale);
+    checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+    // LOOK-2.1 computing grid params
+    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+    gridSideCount = 2 * halfSideCount;
+
+    gridCellCount = gridSideCount * gridSideCount * gridSideCount;
+    gridInverseCellWidth = 1.0f / gridCellWidth;
+    float halfGridWidth = gridCellWidth * halfSideCount;
+    gridMinimum.x -= halfGridWidth;
+    gridMinimum.y -= halfGridWidth;
+    gridMinimum.z -= halfGridWidth;
+
+    // TODO-2.1 TODO-2.3 - Allocate additional buffers here: per-boid index arrays, per-cell start/end arrays.
+    cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+    cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+    cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+    cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc uniform grid buffers failed!"); // one check covers the four allocations above
+    dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+    dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
+    cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3));
+    checkCUDAErrorWithLine("cudaMalloc dev_sorted_pos failed!");
+    cudaDeviceSynchronize();
 }
 
 
@@ -180,42 +191,42 @@ void Boids::initSimulation(int N) {
 /**
 * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
 */
-__global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+__global__ void kernCopyPositionsToVBO(int N, glm::vec3* pos, float* vbo, float s_scale) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+    float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = pos[index].x * c_scale;
+        vbo[4 * index + 1] = pos[index].y * c_scale;
+        vbo[4 * index + 2] = pos[index].z * c_scale;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
-__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+__global__ void kernCopyVelocitiesToVBO(int N, glm::vec3* vel, float* vbo, float s_scale) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+    if (index < N) {
+        vbo[4 * index + 0] = vel[index].x + 0.3f;
+        vbo[4 * index + 1] = vel[index].y + 0.3f;
+        vbo[4 * index + 2] = vel[index].z + 0.3f;
+        vbo[4 * index + 3] = 1.0f;
+    }
 }
 
 /**
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
-void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+void Boids::copyBoidsToVBO(float* vbodptr_positions, float* vbodptr_velocities) {
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+    kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_pos, vbodptr_positions, scene_scale);
+    kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+    checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 }
 
 
@@ -229,47 +240,90 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * Compute the new velocity on the body with index `iSelf` due to the `N` boids
 * in the `pos` and `vel` arrays.
 */
-__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3* pos, const glm::vec3* vel) {
+    // Accumulates all three rules in a single pass over the other N - 1 boids:
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    // Rule 2: boids try to stay a distance d away from each other
+    // Rule 3: boids try to match the speed of surrounding boids
+    const glm::vec3 selfPos = pos[iSelf]; // read once instead of on every comparison
+    glm::vec3 perceivedCenter(0.f, 0.f, 0.f);
+    glm::vec3 separation(0.f, 0.f, 0.f);
+    glm::vec3 perceivedVel(0.f, 0.f, 0.f);
+    glm::vec3 dVel(0.f, 0.f, 0.f);
+    int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0;
+    for (int i = 0; i < N; i++) {
+        if (i == iSelf) continue; // a boid is never its own neighbor
+        // One distance evaluation is shared by all three rule radii.
+        const float distance = glm::distance(pos[i], selfPos);
+        if (distance < rule1Distance) {
+            perceivedCenter += pos[i];
+            ++numOfNeighborsRule1;
+        }
+        if (distance < rule2Distance) {
+            separation -= (pos[i] - selfPos);
+        }
+        if (distance < rule3Distance) {
+            perceivedVel += vel[i];
+            ++numOfNeighborsRule3;
+        }
+    }
+
+    if (numOfNeighborsRule1 > 0) { // skip the average when no boid was in range (avoids divide by zero)
+        perceivedCenter /= numOfNeighborsRule1;
+        dVel += (perceivedCenter - selfPos) * rule1Scale;
+    }
+    dVel += separation * rule2Scale; // stays zero when nothing was within rule2Distance
+    if (numOfNeighborsRule3 > 0) {
+        perceivedVel /= numOfNeighborsRule3;
+        dVel += perceivedVel * rule3Scale;
+    }
+    return dVel;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3* pos,
+    glm::vec3* vel1, glm::vec3* vel2) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x); // one thread per boid
+    if (index >= N) {
+        return;
+    }
+    // Compute a new velocity based on pos and vel1
+    glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+    // Clamp the speed: rescale to length maxSpeed when the new velocity is faster than that
+    float speed = glm::length(newVel);
+    if (speed > maxSpeed) {
+        newVel = newVel / speed * maxSpeed;
+    }
+    // Record the new velocity into vel2. Why NOT vel1? Other threads still read vel1 for their neighbors.
+    vel2[index] = newVel;
 }
 
 /**
 * LOOK-1.2 Since this is pretty trivial, we implemented it for you.
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
+__global__ void kernUpdatePos(int N, float dt, glm::vec3* pos, glm::vec3* vel) {
+    // Update position by velocity
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N) {
+        return;
+    }
+    glm::vec3 thisPos = pos[index];
+    thisPos += vel[index] * dt;
 
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+    // Wrap the boids around so we don't lose them
+    thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+    thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+    thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
 
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+    thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+    thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+    thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
 
-  pos[index] = thisPos;
+    pos[index] = thisPos;
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
@@ -279,179 +333,454 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 //            for(y)
 //             for(z)? Or some other order?
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
 __global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
+    glm::vec3 gridMin, float inverseCellWidth,
+    glm::vec3* pos, int* indices, int* gridIndices) {
     // TODO-2.1
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x; // one thread per boid
+    if (index >= N) {
+        return;
+    }
+
+    indices[index] = index; // every boid starts out pointing at its own slot
+    glm::ivec3 gridIndex = glm::floor((pos[index] - gridMin) * inverseCellWidth); // per-axis cell coordinates measured from gridMin
+    gridIndices[index] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution); // flattened 1D cell index
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 //          does not enclose any boids
-__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+__global__ void kernResetIntBuffer(int N, int* intBuffer, int value) {
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index < N) {
+        intBuffer[index] = value;
+    }
 }
 
-__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+__global__ void kernIdentifyCellStartEnd(int N, int* particleGridIndices,
+    int* gridCellStartIndices, int* gridCellEndIndices) {
+    // TODO-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x; // position within particleGridIndices
+    if (index >= N) {
+        return;
+    }
+    // First entry, or grid index differs from the previous entry: this entry begins its cell's run.
+    if (index == 0 || particleGridIndices[index] != particleGridIndices[index - 1]) {
+        gridCellStartIndices[particleGridIndices[index]] = index;
+    }
+    // Last entry, or grid index differs from the next entry: this entry ends its cell's run (inclusive).
+    if (index == N - 1 || particleGridIndices[index] != particleGridIndices[index + 1]) {
+        gridCellEndIndices[particleGridIndices[index]] = index; // cells that never appear in the array are not written here
+    }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int* gridCellStartIndices, int* gridCellEndIndices,
+    int* particleArrayIndices,
+    glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+    // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    // NOTE: the cellWidth parameter is not referenced anywhere in this kernel body.
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // - Identify the grid cell that this particle is in
+    int selfIdx = particleArrayIndices[index]; // slot of this thread's boid in pos/vel1/vel2
+    glm::vec3 selfPos = pos[selfIdx];
+    selfPos = glm::floor((selfPos - gridMin) * inverseCellWidth); // from here on selfPos holds cell coordinates, not a position
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    int minX, maxX, minY, maxY, minZ, maxZ;
+    // Scan the cells within one step of the boid's cell on each axis, clamped to [0, gridResolution - 1].
+    minX = imax(selfPos.x - 1, 0);
+    maxX = imin(selfPos.x + 1, gridResolution - 1);
+    minY = imax(selfPos.y - 1, 0);
+    maxY = imin(selfPos.y + 1, gridResolution - 1);
+    minZ = imax(selfPos.z - 1, 0);
+    maxZ = imin(selfPos.z + 1, gridResolution - 1);
+
+    glm::vec3 perceivedCenter(0.f, 0.f, 0.f);
+    glm::vec3 seperation(0.f, 0.f, 0.f);
+    glm::vec3 perceivedVel(0.f, 0.f, 0.f);
+
+    glm::vec3 dVel(0.f, 0.f, 0.f);
+
+    int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0;
+
+    for (int z = minZ; z <= maxZ; ++z) {
+        for (int y = minY; y <= maxY; ++y) {
+            for (int x = minX; x <= maxX; ++x) {
+                int cellIdx = gridIndex3Dto1D(x, y, z, gridResolution);
+                // - For each cell, read the start/end indices in the boid pointer array.
+                int startIndex = gridCellStartIndices[cellIdx];
+                int endIndex = gridCellEndIndices[cellIdx];
+
+                if (startIndex != -1) { // -1 is treated as "no boids in this cell"; presumably set by a reset in the caller
+                    // - Access each boid in the cell and compute velocity change from
+                    //   the boids rules, if this boid is within the neighborhood distance.
+                    for (int i = startIndex; i <= endIndex; ++i) { // endIndex is inclusive
+                        int boidIdx = particleArrayIndices[i]; // extra indirection: sorted rank -> boid slot
+                        if (boidIdx != selfIdx) {
+                            float distance = glm::distance(pos[boidIdx], pos[selfIdx]);
+                            if (distance < rule1Distance) {
+                                perceivedCenter += pos[boidIdx];
+                                ++numOfNeighborsRule1;
+                            }
+                            if (distance < rule2Distance) {
+                                seperation -= (pos[boidIdx] - pos[selfIdx]);
+                            }
+                            if (distance < rule3Distance) {
+                                perceivedVel += vel1[boidIdx];
+                                ++numOfNeighborsRule3;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (numOfNeighborsRule1 > 0) { // average only when at least one boid was in range
+        perceivedCenter /= numOfNeighborsRule1;
+        dVel += (perceivedCenter - pos[selfIdx]) * rule1Scale;
+    }
+
+    dVel += seperation * rule2Scale;
+
+    if (numOfNeighborsRule3 > 0) {
+        perceivedVel /= numOfNeighborsRule3;
+        dVel += perceivedVel * rule3Scale;
+    }
+
+    // - Clamp the speed change before putting the new speed in vel2
+    glm::vec3 tmpVel = vel1[selfIdx] + dVel;
+    float speed = glm::length(tmpVel);
+    if (speed > maxSpeed) {
+        tmpVel = tmpVel / speed * maxSpeed;
+    }
+
+    vel2[selfIdx] = tmpVel; // stored at the boid's own slot, not at its sorted rank
 }
 
+__global__ void kernReshuffleBuffer(int N, int* particleArrayIndices, glm::vec3* pos,
+    glm::vec3* sortedPos, glm::vec3* vel1, glm::vec3* vel2) {
+    const int slot = (blockIdx.x * blockDim.x) + threadIdx.x; // output position handled by this thread
+    if (slot < N) {
+        const int src = particleArrayIndices[slot]; // input slot whose data belongs at this output position
+        sortedPos[slot] = pos[src];
+        vel2[slot] = vel1[src];
+    }
+}
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin,
+    float inverseCellWidth, float cellWidth,
+    int* gridCellStartIndices, int* gridCellEndIndices,
+    glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) {
+    // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+    // except with one less level of indirection. (cellWidth is not referenced in this body.)
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= N) {
+        return;
+    }
+
+    // - Identify the grid cell that this particle is in
+    int selfIdx = index; // assumes pos/vel1 were already reordered to match the cell start/end ranges -- confirm in caller
+    glm::vec3 selfPos = pos[selfIdx];
+    selfPos = glm::floor((selfPos - gridMin) * inverseCellWidth); // from here on selfPos holds cell coordinates, not a position
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    int minX, maxX, minY, maxY, minZ, maxZ;
+    // Scan the cells within one step of the boid's cell on each axis, clamped to [0, gridResolution - 1].
+    minX = imax(selfPos.x - 1, 0);
+    maxX = imin(selfPos.x + 1, gridResolution - 1);
+    minY = imax(selfPos.y - 1, 0);
+    maxY = imin(selfPos.y + 1, gridResolution - 1);
+    minZ = imax(selfPos.z - 1, 0);
+    maxZ = imin(selfPos.z + 1, gridResolution - 1);
+
+    glm::vec3 perceivedCenter(0.f, 0.f, 0.f);
+    glm::vec3 seperation(0.f, 0.f, 0.f);
+    glm::vec3 perceivedVel(0.f, 0.f, 0.f);
+
+    glm::vec3 dVel(0.f, 0.f, 0.f);
+
+    int numOfNeighborsRule1 = 0, numOfNeighborsRule3 = 0;
+
+    for (int z = minZ; z <= maxZ; ++z) { // x is innermost, matching gridIndex3Dto1D where x varies fastest
+        for (int y = minY; y <= maxY; ++y) {
+            for (int x = minX; x <= maxX; ++x) {
+                int cellIdx = gridIndex3Dto1D(x, y, z, gridResolution);
+                // - For each cell, read the start/end indices in the boid pointer array.
+                int startIndex = gridCellStartIndices[cellIdx];
+                int endIndex = gridCellEndIndices[cellIdx];
+
+                if (startIndex != -1) { // -1 is treated as "no boids in this cell"; presumably set by a reset in the caller
+                    // - Access each boid in the cell and compute velocity change from
+                    //   the boids rules, if this boid is within the neighborhood distance.
+                    for (int i = startIndex; i <= endIndex; ++i) { // endIndex is inclusive
+                        int boidIdx = i; // no lookup table: the range index addresses pos/vel1 directly
+                        if (boidIdx != selfIdx) {
+                            float distance = glm::distance(pos[boidIdx], pos[selfIdx]);
+                            if (distance < rule1Distance) {
+                                perceivedCenter += pos[boidIdx];
+                                ++numOfNeighborsRule1;
+                            }
+                            if (distance < rule2Distance) {
+                                seperation -= (pos[boidIdx] - pos[selfIdx]);
+                            }
+                            if (distance < rule3Distance) {
+                                perceivedVel += vel1[boidIdx];
+                                ++numOfNeighborsRule3;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (numOfNeighborsRule1 > 0) { // average only when at least one boid was in range
+        perceivedCenter /= numOfNeighborsRule1;
+        dVel += (perceivedCenter - pos[selfIdx]) * rule1Scale;
+    }
+
+    dVel += seperation * rule2Scale;
+
+    if (numOfNeighborsRule3 > 0) {
+        perceivedVel /= numOfNeighborsRule3;
+        dVel += perceivedVel * rule3Scale;
+    }
+
+    // - Clamp the speed change before putting the new speed in vel2
+    glm::vec3 tmpVel = vel1[selfIdx] + dVel;
+    float speed = glm::length(tmpVel);
+    if (speed > maxSpeed) {
+        tmpVel = tmpVel / speed * maxSpeed;
+    }
+
+    vel2[selfIdx] = tmpVel; // same index as the inputs, so the output keeps the input ordering
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+    // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    kernUpdateVelocityBruteForce << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_pos, dev_vel1, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2); // integrate with the new velocities
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+    // TODO-1.2 ping-pong the velocity buffers: trade the two pointers instead of copying numObjects vec3s.
+    glm::vec3* previousVel = dev_vel1;
+    dev_vel1 = dev_vel2; // dev_vel1 now names the velocities computed this step
+    dev_vel2 = previousVel; // the old input buffer becomes the next step's output
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+    // TODO-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //   Use 2x width grids.
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        gridSideCount,
+        gridMinimum,
+        gridInverseCellWidth,
+        dev_pos,
+        dev_particleArrayIndices,
+        dev_particleGridIndices);
+    checkCUDAErrorWithLine("kernComputeIndices failed!");
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+    // Reset the startIndex array; -1 marks a cell with no boids (end indices are only read for non-empty cells)
+    kernResetIntBuffer << <fullBlocksPerGridCell, blockSize >> > (gridCellCount,
+        dev_gridCellStartIndices,
+        -1);
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        dev_particleGridIndices,
+        dev_gridCellStartIndices,
+        dev_gridCellEndIndices);
+    checkCUDAErrorWithLine("grid cell start/end indexing failed!"); // covers the reset and identify launches
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        gridSideCount,
+        gridMinimum,
+        gridInverseCellWidth,
+        gridCellWidth,
+        dev_gridCellStartIndices,
+        dev_gridCellEndIndices,
+        dev_particleArrayIndices,
+        dev_pos,
+        dev_vel1,
+        dev_vel2);
+    checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+    // - Update positions
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel2);
+    checkCUDAErrorWithLine("kernUpdatePos failed!");
+    // - Ping-pong buffers as needed: trade the two pointers instead of copying numObjects vec3s.
+    glm::vec3* previousVel = dev_vel1;
+    dev_vel1 = dev_vel2; // dev_vel1 now names the velocities computed this step
+    dev_vel2 = previousVel; // the old input buffer becomes the next step's output
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+    // TODO-2.3 - Advance the simulation by one step of length dt.
+    // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+    // In Parallel:
+    // - label each particle with its array index as well as its grid index.
+    //   The cell size is fixed by gridCellWidth, which is set in Boids::initSimulation.
+    kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        gridSideCount,
+        gridMinimum,
+        gridInverseCellWidth,
+        dev_pos,
+        dev_particleArrayIndices,
+        dev_particleGridIndices);
+
+    // - Unstable key sort using Thrust: the grid indices are the keys and the
+    //   particle array indices are the values that get permuted with them.
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices (one thread per grid cell for the reset)
+    dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+    // Reset the startIndex array to -1 (dev_gridCellEndIndices is not reset here)
+    kernResetIntBuffer << <fullBlocksPerGridCell, blockSize >> > (gridCellCount,
+        dev_gridCellStartIndices,
+        -1);
+
+    kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        dev_particleGridIndices,
+        dev_gridCellStartIndices,
+        dev_gridCellEndIndices);
+
+    // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+    //   the particle data in the simulation array. Presumably dev_pos -> dev_sorted_pos
+    //   and dev_vel1 -> dev_vel2 (kernel body not shown here; confirm against its definition)
+    kernReshuffleBuffer << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        dev_particleArrayIndices, 
+        dev_pos, 
+        dev_sorted_pos, 
+        dev_vel1, 
+        dev_vel2);
+
+    // - Perform velocity updates using neighbor search on the reshuffled data (dev_sorted_pos, dev_vel2)
+    kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (numObjects,
+        gridSideCount,
+        gridMinimum,
+        gridInverseCellWidth,
+        gridCellWidth,
+        dev_gridCellStartIndices,
+        dev_gridCellEndIndices,
+        dev_sorted_pos,
+        dev_vel2,
+        dev_vel1);
+
+    // - Update the reshuffled positions in dev_sorted_pos using the velocities in dev_vel1
+    kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_sorted_pos, dev_vel1);
+
+    // - Copy back device-to-device (no pointer swap): dev_vel1 into dev_vel2, dev_sorted_pos into dev_pos
+    cudaMemcpy(dev_vel2, dev_vel1, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
+    cudaMemcpy(dev_pos, dev_sorted_pos, sizeof(glm::vec3) * numObjects, cudaMemcpyDeviceToDevice);
 }
 
 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
+    cudaFree(dev_vel1);
+    cudaFree(dev_vel2);
+    cudaFree(dev_pos);
 
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+    // TODO-2.1 TODO-2.3 - Free the grid index buffers and the sorted-position buffer.
+    cudaFree(dev_particleArrayIndices);
+    cudaFree(dev_particleGridIndices);
+    cudaFree(dev_gridCellStartIndices);
+    cudaFree(dev_gridCellEndIndices);
+
+    cudaFree(dev_sorted_pos); // NOTE(review): none of the cudaFree return codes here are checked
 }
 
 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  std::unique_ptr<int[]>intKeys{ new int[N] };
-  std::unique_ptr<int[]>intValues{ new int[N] };
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
+    // LOOK-1.2 Sorts a small key/value set on the GPU and prints it; add further tests here.
+
+    // Test the unstable sort on N key/value pairs that contain duplicate keys.
+    int* dev_intKeys;
+    int* dev_intValues;
+    int N = 10;
+
+    std::unique_ptr<int[]>intKeys{ new int[N] };
+    std::unique_ptr<int[]>intValues{ new int[N] };
+
+    intKeys[0] = 0; intValues[0] = 0;
+    intKeys[1] = 1; intValues[1] = 1;
+    intKeys[2] = 0; intValues[2] = 2;
+    intKeys[3] = 3; intValues[3] = 3;
+    intKeys[4] = 0; intValues[4] = 4;
+    intKeys[5] = 2; intValues[5] = 5;
+    intKeys[6] = 2; intValues[6] = 6;
+    intKeys[7] = 0; intValues[7] = 7;
+    intKeys[8] = 5; intValues[8] = 8;
+    intKeys[9] = 6; intValues[9] = 9;
+
+    cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+    cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+    checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+    dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); // NOTE(review): not used anywhere in this test
+
+    std::cout << "before unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // Copy the host keys and values into the device buffers.
+    cudaMemcpy(dev_intKeys, intKeys.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_intValues, intValues.get(), sizeof(int) * N, cudaMemcpyHostToDevice);
+
+    // Wrap the raw device pointers in thrust::device_ptr so thrust can use them.
+    thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+    thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+    // LOOK-2.1 Example for using thrust::sort_by_key: values are permuted along with the sorted keys.
+    thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+    // Copy the sorted keys and values back to the host so they can be printed.
+    cudaMemcpy(intKeys.get(), dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    cudaMemcpy(intValues.get(), dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+    checkCUDAErrorWithLine("memcpy back failed!");
+
+    std::cout << "after unstable sort: " << std::endl;
+    for (int i = 0; i < N; i++) {
+        std::cout << "  key: " << intKeys[i];
+        std::cout << " value: " << intValues[i] << std::endl;
+    }
+
+    // Free the device buffers allocated for this test.
+    cudaFree(dev_intKeys);
+    cudaFree(dev_intValues);
+    checkCUDAErrorWithLine("cudaFree failed!");
+    return;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..46422e0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**

From 39500c8dadb8e56b1b230f071502d40dd52fab59 Mon Sep 17 00:00:00 2001
From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com>
Date: Thu, 8 Sep 2022 22:21:03 -0700
Subject: [PATCH 2/7] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index d63a6a1..9112e77 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Wenqing Wang
+[LinkedIn](https://www.linkedin.com/in/wenqingwang0910/)
+* Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti
 
 ### (TODO: Your README)
 

From fb7e53e2c3045b807d7a52b939a3aea745912453 Mon Sep 17 00:00:00 2001
From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com>
Date: Sun, 11 Sep 2022 17:41:11 -0400
Subject: [PATCH 3/7] Update README.md

---
 README.md | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 9112e77..084cef1 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,27 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-Wenqing Wang
-[LinkedIn](https://www.linkedin.com/in/wenqingwang0910/)
+* Wenqing Wang
+  * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) 
 * Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti
 
-### (TODO: Your README)
+## Screenshots
+* Coherent with 10K boids
+![1](https://user-images.githubusercontent.com/33616958/189548869-6924fda7-1c0e-4308-952d-dffebb1ec029.gif)
+* Coherent with 20K boids
+![2](https://user-images.githubusercontent.com/33616958/189548865-61dd7752-f4a0-45aa-a383-f948aa85e920.gif)
+* Coherent with 80K boids
+![8](https://user-images.githubusercontent.com/33616958/189548868-0981d6cb-dec3-4cd5-b6ca-efe10a81c999.gif)
+
+
+
+## Performance Analysis
+From the plots below, we can see that for all 3 methods, the average frame per second decreases as the # of boids increase. That's becasuse we'll need to process more data as the # of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute force search to check every rule for every 2 boids, we check the 27 neighbor cells of each boid, which greatly reduces the simulation effort. The performance was further improved after we optimized the data access method in the coherent method.
+![fps_w_v](https://user-images.githubusercontent.com/33616958/189547897-78ed6b50-76d0-4bb7-90e3-e1e491814548.png)
+
+After disabling visualization, the framerates reported below are for the the simulation only:
+![fps_wo_v](https://user-images.githubusercontent.com/33616958/189547898-3ca487ae-1ada-4b53-90f0-550108a8399c.png)
+
+Also, it seems that changing the block size doesn't have much impact on performance. I think this is because it does not affect the total data we need to process or the number of threads needed to process it.
+![fps_w_blocksize](https://user-images.githubusercontent.com/33616958/189547900-52a10a80-40e4-4ddc-af58-9eb90d97be9c.png)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)

From 9f898538a0c45edb51088ae4972a2b64fd825024 Mon Sep 17 00:00:00 2001
From: FridaWang <379788704@qq.com>
Date: Sun, 11 Sep 2022 17:42:45 -0400
Subject: [PATCH 4/7] tiny fix

---
 src/kernel.cu | 2 +-
 src/main.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/kernel.cu b/src/kernel.cu
index c7002e1..cc3a31d 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -158,7 +158,7 @@ void Boids::initSimulation(int N) {
     checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
 
     // LOOK-2.1 computing grid params
-    gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+    gridCellWidth = 1.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
     int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
     gridSideCount = 2 * halfSideCount;
 
diff --git a/src/main.cpp b/src/main.cpp
index 46422e0..8a53937 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -18,7 +18,7 @@
 #define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 10000;
+const int N_FOR_VIS = 20000;
 const float DT = 0.2f;
 
 /**

From f4f8f2cdeb0b0337a4bacdda83868ccc999a6a4b Mon Sep 17 00:00:00 2001
From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com>
Date: Sun, 11 Sep 2022 20:31:31 -0400
Subject: [PATCH 5/7] Update README.md

---
 README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 084cef1..eb1fca7 100644
--- a/README.md
+++ b/README.md
@@ -16,12 +16,19 @@ Project 1 - Flocking**
 
 
 ## Performance Analysis
-From the plots below, we can see that for all 3 methods, the average frame per second decreases as the # of boids increase. That's becasuse we'll need to process more data as the # of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute force search to check every rule for every 2 boids, we check the 27 neighbor cells of each boid, which greatly reduces the simulation effort. The performance was further improved after we optimized the data access method in the coherent method.
 ![fps_w_v](https://user-images.githubusercontent.com/33616958/189547897-78ed6b50-76d0-4bb7-90e3-e1e491814548.png)
 
 After disabling visualization, the framerates reported below are for the the simulation only:
 ![fps_wo_v](https://user-images.githubusercontent.com/33616958/189547898-3ca487ae-1ada-4b53-90f0-550108a8399c.png)
 
-Also, it seems that changing the block size doesn't have much impact on performance. I think this is because it does not affect the total data we need to process or the number of threads needed to process it.
 ![fps_w_blocksize](https://user-images.githubusercontent.com/33616958/189547900-52a10a80-40e4-4ddc-af58-9eb90d97be9c.png)
 
+* Questions 
+1. For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+   - From the plots above, we can see that for all 3 methods, the average frames per second decreases as the number of boids increases. That's because we need to process more data as the number of boids increases. When we switch from the naive method to the uniform grid search, the performance improves because instead of performing a brute-force search that checks every rule for every pair of boids, we only check the boids in the 27 neighboring cells of each boid, which greatly reduces the simulation effort.
+2. For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+   - It seems that changing the block size doesn't have much impact on performance (at least, there is no clear pattern). I think this is because it does not affect the total amount of data we need to process or the number of threads needed to process that data.
+3. For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+   - Yes, the performance improves when we rearrange the data buffers in the coherent uniform grid method. This is because we no longer need to get the boid index from the `dev_particleArrayIndices` buffer, which reduces the data access operations.
+4. Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+   - Changing the cell width of the uniform grid to the neighborhood distance and checking 27 cells instead of 8 improves the performance on my laptop. I suspect this is because, although we check more cells, halving the cell width means we actually search a smaller volume (containing fewer boids) each time: with neighborhood distance d, 27 cells of width d cover (3d)^3 = 27d^3, while 8 cells of width 2d cover (4d)^3 = 64d^3.

From 5d4b5518a5e96a65555cd9863f5ab517ed54d36d Mon Sep 17 00:00:00 2001
From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com>
Date: Wed, 14 Sep 2022 17:19:44 -0400
Subject: [PATCH 6/7] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index eb1fca7..5f5c1ea 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
+======================
 
 * Wenqing Wang
   * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) 

From 56fcfd2c890a27972359162dbba8ece3d87528c5 Mon Sep 17 00:00:00 2001
From: Wenqing Wang <33616958+FridaWang@users.noreply.github.com>
Date: Sun, 18 Sep 2022 20:57:33 -0400
Subject: [PATCH 7/7] Update README.md

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5f5c1ea..be1a3e7 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
-Project 1 - Flocking**
+Flocking
 ======================
 
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+
 * Wenqing Wang
   * [LinkedIn](https://www.linkedin.com/in/wenqingwang0910/) 
 * Tested on: Windows 11, i7-11370H @ 3.30GHz 16.0 GB, GTX 3050 Ti