diff --git a/README.md b/README.md
index d63a6a1..b9c2e0c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,37 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
+### Screenshots
+* 5000 Boids
+![](images/b-5000.gif)
+* 50000 Boids
+![](images/b-50000.gif)
+* 500000 Boids
+![](images/b-500000.gif)
+* 1000000 Boids
+![](images/b-1000000.gif)
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Haoquan Liang
+  * [LinkedIn](https://www.linkedin.com/in/leohaoquanliang/)
+* Tested on: Windows 10, Ryzen 7 5800X 8 Core 3.80 GHz, NVIDIA GeForce RTX 3080 Ti 12 GB
-### (TODO: Your README)
+### Extra Feature
+* Grid-Looping Optimization: change `searchRadius` in `kernUpdateVelNeighborSearchScattered()` and `kernUpdateVelNeighborSearchCoherent()` to change the cell search radius: 2 checks 8 neighboring cells, 3 checks 27 (see the loop sketch at the end of this README).
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Performance Analysis
+#### Average Frame Rate vs. Number of Boids (with visualization)
+![](images/pa1.png)
+#### Average Frame Rate vs. Number of Boids (without visualization)
+![](images/pa2.png)
+#### Average Frame Rate vs. Block Size (Coherent Grid, 50000 boids, visualization on)
+![](images/pa3.png)
+
+### Questions and Answers
+#### For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+* The more boids there are to compute, the lower the performance. This is expected: once the number of boids exceeds the number of threads the GPU can run concurrently, the work can no longer all be done at once and takes additional cycles to finish, so performance drops.
+#### For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+* In my results, the block count/block size does not affect performance noticeably. This is likely because every block size I tested was large enough to keep all of the GPU's available threads occupied.
+#### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+* Yes. I expected a performance improvement, but not a significant one. In my results, with fewer than 50000 boids the coherent grid is only slightly better than the scattered grid, but with more boids the difference becomes significant. I did not expect this, and I think it is because as the number of boids grows, the cost of going through the "middle man" index array (and the scattered memory accesses it causes) also grows, so the benefit of cutting out the middle man grows with it.
+#### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+* Yes, it affects performance. When the number of boids is small, the difference is very small. With more boids, the cost of checking each cell and the amount of data each thread has to read grow significantly, which noticeably affects performance. Note that the cell count alone does not decide the winner: with a smaller cell width, 27 cells cover a smaller total volume than 8 cells of twice the width, so fewer candidate boids may need to be tested; the outcome depends on boid density rather than simply on the number of cells checked.
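+
+The neighbor-cell loop that `searchRadius` controls is sketched below (a simplified excerpt from `kernUpdateVelNeighborSearchScattered()`; the coherent version uses the same loop):
+```
+// searchRadius cells are visited along each axis, starting at the boid's own
+// cell and stepping toward the nearer side, so searchRadius^3 cells are
+// checked in total (2 -> 8 cells, 3 -> 27 cells).
+for (int z = gridIndex3.z; abs(z - gridIndex3.z) < searchRadius; z += zDir) {
+  for (int y = gridIndex3.y; abs(y - gridIndex3.y) < searchRadius; y += yDir) {
+    for (int x = gridIndex3.x; abs(x - gridIndex3.x) < searchRadius; x += xDir) {
+      // ... accumulate rule 1/2/3 contributions from the boids in cell (x, y, z)
+    }
+  }
+}
+```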
\ No newline at end of file
diff --git a/images/b-1000000.gif b/images/b-1000000.gif
new file mode 100644
index 0000000..5ee4bc9
Binary files /dev/null and b/images/b-1000000.gif differ
diff --git a/images/b-234567.gif b/images/b-234567.gif
new file mode 100644
index 0000000..64f3867
Binary files /dev/null and b/images/b-234567.gif differ
diff --git a/images/b-5000.gif b/images/b-5000.gif
new file mode 100644
index 0000000..b95e187
Binary files /dev/null and b/images/b-5000.gif differ
diff --git a/images/b-50000.gif b/images/b-50000.gif
new file mode 100644
index 0000000..97617e7
Binary files /dev/null and b/images/b-50000.gif differ
diff --git a/images/b-500000.gif b/images/b-500000.gif
new file mode 100644
index 0000000..5417250
Binary files /dev/null and b/images/b-500000.gif differ
diff --git a/images/b-5000000.gif b/images/b-5000000.gif
new file mode 100644
index 0000000..3ba36d8
Binary files /dev/null and b/images/b-5000000.gif differ
diff --git a/images/pa1.png b/images/pa1.png
new file mode 100644
index 0000000..e9e1787
Binary files /dev/null and b/images/pa1.png differ
diff --git a/images/pa2.png b/images/pa2.png
new file mode 100644
index 0000000..5075b30
Binary files /dev/null and b/images/pa2.png differ
diff --git a/images/pa3.png b/images/pa3.png
new file mode 100644
index 0000000..57dd741
Binary files /dev/null and b/images/pa3.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..907526b 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,8 @@ int *dev_gridCellEndIndices; // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_pos_buf;
+glm::vec3* dev_vel_buf;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +171,27 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
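+  // Additional buffers for TODO-2.1 / TODO-2.3: per-boid grid-cell and array
+  // indices, per-cell start/end indices, and the staging buffers
+  // (dev_pos_buf / dev_vel_buf) used to reshuffle pos/vel into cell-coherent
+  // order for the coherent-grid path.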
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+
+  cudaMalloc((void**)&dev_pos_buf, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos_buf failed!");
+
+  cudaMalloc((void**)&dev_vel_buf, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel_buf failed!");
+
   cudaDeviceSynchronize();
 }
 
@@ -233,7 +256,50 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 pc = glm::vec3(0.f, 0.f, 0.f);  // perceived center of mass
+  glm::vec3 c = glm::vec3(0.f, 0.f, 0.f);   // displacement to avoid collisions
+  glm::vec3 pv = glm::vec3(0.f, 0.f, 0.f);  // perceived velocity
+  float pc_count = 0, pv_count = 0;         // neighbor counts for rules 1 and 3
+
+  for (int i = 0; i < N; ++i)
+  {
+    if (i != iSelf)
+    {
+      float dist = glm::distance(pos[i], pos[iSelf]);
+      // For rule 1
+      if (dist < rule1Distance)
+      {
+        pc += pos[i];
+        pc_count++;
+      }
+      // For rule 2
+      if (dist < rule2Distance)
+      {
+        c -= (pos[i] - pos[iSelf]);
+      }
+      // For rule 3
+      if (dist < rule3Distance)
+      {
+        pv += vel[i];
+        pv_count++;
+      }
+    }
+  }
+  glm::vec3 v1 = glm::vec3(0.f, 0.f, 0.f);
+  if (pc_count > 0)
+  {
+    pc /= pc_count;
+    v1 = (pc - pos[iSelf]) * rule1Scale;  // velocity change from rule 1
+  }
+  glm::vec3 v2 = c * rule2Scale;  // velocity change from rule 2
+  glm::vec3 v3 = glm::vec3(0.f, 0.f, 0.f);
+  if (pv_count > 0)
+  {
+    pv /= pv_count;
+    v3 = pv * rule3Scale;  // velocity change from rule 3
+  }
+  return v1 + v2 + v3;
 }
 
 /**
@@ -243,8 +309,15 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 newVel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+  vel2[index] = glm::clamp(newVel, -1 * maxSpeed, maxSpeed);
+  // Why not vel1? Other threads may still be reading the previous velocities, so we write into the other buffer and ping-pong.
 }
 
 /**
@@ -289,6 +362,16 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 bpos = pos[index];
+  indices[index] = index;
+  glm::vec3 dist = bpos - gridMin;  // offset of the current boid position from gridMin
+  int gridIndex = gridIndex3Dto1D(glm::floor(dist.x * inverseCellWidth), glm::floor(dist.y * inverseCellWidth),
+    glm::floor(dist.z * inverseCellWidth), gridResolution);
+  gridIndices[index] = gridIndex;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +389,19 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  // The first boid always starts a new cell; reading particleGridIndices[index - 1]
+  // at index 0 would be out of bounds.
+  if (index == 0 || particleGridIndices[index - 1] != particleGridIndices[index])
+  {
+    gridCellStartIndices[particleGridIndices[index]] = index;
+    if (index != 0)
+    {
+      gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+    }
+  }
+  // The last boid closes the last non-empty cell.
+  if (index == N - 1)
+  {
+    gridCellEndIndices[particleGridIndices[index]] = index;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +418,121 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Update a boid's velocity using the uniform grid to reduce
   //   the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
   // - Identify which cells may contain neighbors. This isn't always 8.
   // - For each cell, read the start/end indices in the boid pointer array.
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  int searchRadius = 2;  // 2 -> 8 neighboring cells, 3 -> 27
+
+  glm::vec3 bPos = pos[index];
+  glm::vec3 bVel = vel1[index];
+
+  glm::vec3 dist = bPos - gridMin;  // offset of the current boid position from gridMin
+// glm::ivec3 gridIndex3 = glm::ivec3(glm::floor(dist.x * inverseCellWidth), glm::floor(dist.y * inverseCellWidth), glm::floor(dist.z * inverseCellWidth));
+  glm::ivec3 gridIndex3(round(dist.x * inverseCellWidth), round(dist.y * inverseCellWidth), round(dist.z * inverseCellWidth));
+
+  int gridIndex = gridIndex3Dto1D(gridIndex3.x, gridIndex3.y, gridIndex3.z, gridResolution);
+
+  int xDir, yDir, zDir;
+
+  // Check which side is closest
+  if (dist.x > (gridIndex3.x + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    xDir = 1;
+  }
+  else
+  {
+    xDir = -1;
+  }
+  if (dist.y > (gridIndex3.y + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    yDir = 1;
+  }
+  else
+  {
+    yDir = -1;
+  }
+  if (dist.z > (gridIndex3.z + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    zDir = 1;
+  }
+  else
+  {
+    zDir = -1;
+  }
+
+  float pc_count = 0, pv_count = 0;
+  glm::vec3 pc = glm::vec3(0.f, 0.f, 0.f);  // perceived center of mass
+  glm::vec3 c = glm::vec3(0.f, 0.f, 0.f);   // displacement to avoid collisions
+  glm::vec3 pv = glm::vec3(0.f, 0.f, 0.f);  // perceived velocity
+
+  for (int z = gridIndex3.z; abs(z - gridIndex3.z) < searchRadius; z += zDir)
+  {
+    for (int y = gridIndex3.y; abs(y - gridIndex3.y) < searchRadius; y += yDir)
+    {
+      for (int x = gridIndex3.x; abs(x - gridIndex3.x) < searchRadius; x += xDir)
+      {
+        int indexToCheck = gridIndex3Dto1D(x, y, z, gridResolution);
+        if (indexToCheck >= 0 && indexToCheck < gridResolution * gridResolution * gridResolution)
+        {
+          int startIndex = gridCellStartIndices[indexToCheck];
+          int endIndex = gridCellEndIndices[indexToCheck];
+          if (startIndex == -1 || endIndex == -1)
+          {
+            continue;
+          }
+          for (int j = startIndex; j <= endIndex; ++j)
+          {
+            int bIndex = particleArrayIndices[j];
+            if (bIndex != index)
+            {
+              float dist = glm::distance(pos[bIndex], bPos);
+              // For rule 1
+              if (dist < rule1Distance)
+              {
+                pc += pos[bIndex];
+                pc_count++;
+              }
+              // For rule 2
+              if (dist < rule2Distance)
+              {
+                c -= (pos[bIndex] - bPos);
+              }
+              // For rule 3
+              if (dist < rule3Distance)
+              {
+                pv += vel1[bIndex];
+                pv_count++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  glm::vec3 v1 = glm::vec3(0.f, 0.f, 0.f);
+  if (pc_count > 0)
+  {
+    pc = pc / pc_count;
+    v1 = (pc - pos[index]) * rule1Scale;  // velocity change from rule 1
+  }
+  glm::vec3 v2 = c * rule2Scale;  // velocity change from rule 2
+  glm::vec3 v3 = glm::vec3(0.f, 0.f, 0.f);
+  if (pv_count > 0)
+  {
+    pv = pv / pv_count;
+    v3 = pv * rule3Scale;  // velocity change from rule 3
+  }
+
+  glm::vec3 newVel = bVel + v1 + v2 + v3;
+
+  // Clamp the speed
+  // Record the new velocity into vel2. Question: why NOT vel1?
+  vel2[index] = glm::clamp(newVel, -1 * maxSpeed, maxSpeed);
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +552,120 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+
+  int searchRadius = 2;  // 2 -> 8 neighboring cells, 3 -> 27
+
+  glm::vec3 bPos = pos[index];
+  glm::vec3 bVel = vel1[index];
+
+  glm::vec3 dist = bPos - gridMin;  // offset of the current boid position from gridMin
+// glm::ivec3 gridIndex3 = glm::ivec3(glm::floor(dist.x * inverseCellWidth), glm::floor(dist.y * inverseCellWidth), glm::floor(dist.z * inverseCellWidth));
+  glm::ivec3 gridIndex3(round(dist.x * inverseCellWidth), round(dist.y * inverseCellWidth), round(dist.z * inverseCellWidth));
+
+  int gridIndex = gridIndex3Dto1D(gridIndex3.x, gridIndex3.y, gridIndex3.z, gridResolution);
+
+  int xDir, yDir, zDir;
+
+  // Check which side is closest
+  if (dist.x > (gridIndex3.x + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    xDir = 1;
+  }
+  else
+  {
+    xDir = -1;
+  }
+  if (dist.y > (gridIndex3.y + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    yDir = 1;
+  }
+  else
+  {
+    yDir = -1;
+  }
+  if (dist.z > (gridIndex3.z + 1) * cellWidth + 0.5 * cellWidth)
+  {
+    zDir = 1;
+  }
+  else
+  {
+    zDir = -1;
+  }
+
+  float pc_count = 0, pv_count = 0;
+  glm::vec3 pc = glm::vec3(0.f, 0.f, 0.f);  // perceived center of mass
+  glm::vec3 c = glm::vec3(0.f, 0.f, 0.f);   // displacement to avoid collisions
+  glm::vec3 pv = glm::vec3(0.f, 0.f, 0.f);  // perceived velocity
+
+  for (int z = gridIndex3.z; abs(z - gridIndex3.z) < searchRadius; z += zDir)
+  {
+    for (int y = gridIndex3.y; abs(y - gridIndex3.y) < searchRadius; y += yDir)
+    {
+      for (int x = gridIndex3.x; abs(x - gridIndex3.x) < searchRadius; x += xDir)
+      {
+        int indexToCheck = gridIndex3Dto1D(x, y, z, gridResolution);
+        if (indexToCheck >= 0 && indexToCheck < gridResolution * gridResolution * gridResolution)
+        {
+          int startIndex = gridCellStartIndices[indexToCheck];
+          int endIndex = gridCellEndIndices[indexToCheck];
+          if (startIndex == -1 || endIndex == -1)
+          {
+            continue;
+          }
+          // pos/vel are already cell-coherent here, so j indexes the boid data directly
+          for (int j = startIndex; j <= endIndex; ++j)
+          {
+            if (j != index)
+            {
+              float dist = glm::distance(pos[j], bPos);
+              // For rule 1
+              if (dist < rule1Distance)
+              {
+                pc += pos[j];
+                pc_count++;
+              }
+              // For rule 2
+              if (dist < rule2Distance)
+              {
+                c -= (pos[j] - bPos);
+              }
+              // For rule 3
+              if (dist < rule3Distance)
+              {
+                pv += vel1[j];
+                pv_count++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  glm::vec3 v1 = glm::vec3(0.f, 0.f, 0.f);
+  if (pc_count > 0)
+  {
+    pc = pc / pc_count;
+    v1 = (pc - pos[index]) * rule1Scale;  // velocity change from rule 1
+  }
+  glm::vec3 v2 = c * rule2Scale;  // velocity change from rule 2
+  glm::vec3 v3 = glm::vec3(0.f, 0.f, 0.f);
+  if (pv_count > 0)
+  {
+    pv = pv / pv_count;
+    v3 = pv * rule3Scale;  // velocity change from rule 3
+  }
+
+  glm::vec3 newVel = bVel + v1 + v2 + v3;
+
+  // Clamp the speed
+  // Record the new velocity into vel2. Question: why NOT vel1?
+  vel2[index] = glm::clamp(newVel, -1 * maxSpeed, maxSpeed);
 }
 
 /**
@@ -348,7 +673,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos,
+    dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
   // TODO-1.2 ping-pong the velocity buffers
+  glm::vec3* tmp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tmp;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +696,36 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellCount((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos,
+    dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<cellCount, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellCount, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+  // Updating pos and ping-ponging the velocity buffers works the same as in the naive step
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  glm::vec3* tmp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tmp;
+}
+
+__global__ void kernRearrangeIndexBuffer(int N, int* particleArrayIndices, glm::vec3* pos, glm::vec3* vel1, glm::vec3* pos_buf, glm::vec3* vel_buf)
+{
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  // Gather pos/vel into cell-sorted order so neighbor reads are coherent
+  pos_buf[index] = pos[particleArrayIndices[index]];
+  vel_buf[index] = vel1[particleArrayIndices[index]];
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +744,36 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
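+  // Note: unlike the scattered version, pos and vel1 are first gathered into
+  // dev_pos_buf / dev_vel_buf in cell-sorted order, so the position buffer
+  // must also be ping-ponged at the end of the step.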
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 cellCount((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos,
+    dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<cellCount, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<cellCount, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  // Rearrange the boid data into cell-coherent order
+  kernRearrangeIndexBuffer<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_pos_buf, dev_vel_buf);
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos_buf, dev_vel_buf, dev_vel2);
+
+  // Update pos, then ping-pong both the velocity and the position buffers
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos_buf, dev_vel2);
+  glm::vec3* tmp = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tmp;
+
+  tmp = dev_pos;
+  dev_pos = dev_pos_buf;
+  dev_pos_buf = tmp;
 }
 
 void Boids::endSimulation() {
@@ -390,6 +782,13 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_vel_buf);
+  cudaFree(dev_pos_buf);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..3ef6dc7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;
@@ -114,7 +114,7 @@ bool init(int argc, char **argv) {
   updateCamera();
 
   initShaders(program);
-  
+
   glEnable(GL_DEPTH_TEST);
 
   return true;
@@ -216,6 +216,9 @@ void initShaders(GLuint * program) {
   double fps = 0;
   double timebase = 0;
   int frame = 0;
+
+  int totalFrame = 0;
+  int secCount = 0;
 
   Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure
   // your CUDA development setup is ready to go.
@@ -229,6 +232,10 @@ void initShaders(GLuint * program) {
     if (time - timebase > 1.0) {
       fps = frame / (time - timebase);
       timebase = time;
+
+      totalFrame += frame;
+      secCount += 1;
+
       frame = 0;
     }
 
@@ -258,6 +265,9 @@ void initShaders(GLuint * program) {
   }
   glfwDestroyWindow(window);
   glfwTerminate();
+
+  // Report the average frame rate over the whole run (guard against closing within the first second)
+  std::cout << "Average Frame Rate: " << (secCount > 0 ? totalFrame / secCount : 0) << std::endl;
 }
 
 
@@ -269,6 +279,15 @@ void initShaders(GLuint * program) {
   if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
     glfwSetWindowShouldClose(window, GL_TRUE);
   }
+  // P to pause, R to resume; change DT to non-const to enable this
+  /*
+  if (key == GLFW_KEY_P && action == GLFW_PRESS) {
+    DT = 0.f;
+  }
+  if (key == GLFW_KEY_R && action == GLFW_PRESS) {
+    DT = 0.2f;
+  }
+  */
 }
 
 void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods) {
diff --git a/src/main.hpp b/src/main.hpp
index 88e9df7..3bcd96f 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,8 +36,8 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
-int width = 1280;
-int height = 720;
+int width = 2560;
+int height = 1440;
 int pointSize = 2;
 
 // For camera controls