diff --git a/README.md b/README.md index d63a6a1..fd01013 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,64 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) +* Yilin Liu + * [LinkedIn](https://www.linkedin.com/in/yilin-liu-9538ba1a5/), [Personal Website](https://yilin.games/) +* Tested on: Windows 10, i7-10750H @ 2.59GHz 32GB, GeForce RTX 2070 with Max-Q 8GB (Personal Laptop) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# Project 1 Results + +## Screenshots: +|![image](./images/1M_200Scale.gif)| +|:--:| +| *1M boids with Uniform Coherent Grid* | + + +|![image](./images/200k%20uniform%20grid.gif)| +|:--:| +| *200k boids with Uniform Grid* | + + +## Analysis + +|![image](./images/naive_table.png)| +|:--:| +| *Naive method* | + +|![image](./images/discrete_table.png)| +|:--:| +| *Uniform Grid Scattered method* | + +|![image](./images/coherent_table.png)| +|:--:| +| *Uniform Grid Coherent method* | + +|![image](./images/Framerate%20vs%20Methods.png)| +|:--:| +| *Comparison of the three methods* | + +|![image](./images/block_Size.png)| +|:--:| +| *Effect of Block Size* | + +## Questions: + +- For each implementation, how does changing the number of boids affect performance? Why do you think this is? + + **Answer**: +For the naive implementation, increasing the number of boids significantly hurts performance. The brute-force method is O(n^2) in the number of boids, since every boid must compute its distance to every other boid, so the cost grows quadratically with the boid count. +For the uniform grid methods, the effect is much less pronounced, since each boid only checks the boids in the 8 (out of 27) neighboring cells that can actually contain neighbors, so the impact of adding more boids is greatly reduced. + +- For each implementation, how does changing the block count and block size affect performance? Why do you think this is? + + **Answer**: For all implementations, increasing the block size up to 64 improves performance. Beyond 64, increasing the block size slightly reduces performance, which is likely explained by idle threads being wasted in larger blocks. + +- For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not? + + **Answer**: + Yes. The benefit of the coherent uniform grid grows as the number of boids increases, which is the expected outcome: reordering the position and velocity data to be contiguous within cells removes one level of indirection and makes memory accesses more coalesced, and those savings grow with the amount of data being transferred. + +- Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check! + + **Answer**: + Yes. Switching to 27 cells with a cell width equal to the neighborhood distance significantly improves performance over 8 cells of twice that width, and the effect is especially obvious when the boids are very dense. Although more cells are checked, each cell is one eighth the volume, so the total search volume (27d^3 versus 8(2d)^3 = 64d^3 for neighborhood distance d) and therefore the number of candidate boids is smaller; see the sketch below.
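To make the 8-cell versus 27-cell comparison concrete, here is a minimal sketch (not the exact code in `kernel.cu`) of how the neighbor-cell loop bounds differ between the two configurations; the names `gridIndex3D`, `gridResolution`, and `gridIndex3Dto1D` mirror the ones used in the implementation, and `d` stands for the largest rule distance.

```cpp
// Sketch only: the two neighbor-search loop shapes discussed above.
// 8-cell search (cell width = 2 * d): this cell plus the 7 cells toward the nearest corner.
for (int z = gridIndex3D.z; z <= gridIndex3D.z + 1; z++) {
  for (int y = gridIndex3D.y; y <= gridIndex3D.y + 1; y++) {
    for (int x = gridIndex3D.x; x <= gridIndex3D.x + 1; x++) {
      if (x < 0 || y < 0 || z < 0 ||
          x >= gridResolution || y >= gridResolution || z >= gridResolution) continue;
      int cell = gridIndex3Dto1D(x, y, z, gridResolution);
      // ... accumulate the three boid rules over the boids stored in `cell`
    }
  }
}

// 27-cell search (cell width = d): the full 3x3x3 block around the boid's cell,
// i.e. loop from gridIndex3D.{x,y,z} - 1 to gridIndex3D.{x,y,z} + 1 with the same bounds check.
```

Although the 27-cell variant touches more cells, each cell is one eighth the volume, so the total volume scanned is 27d^3 instead of 8(2d)^3 = 64d^3, which is why it can end up testing fewer candidate boids.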
\ No newline at end of file diff --git a/images/1M_200Scale.gif b/images/1M_200Scale.gif new file mode 100644 index 0000000..280013d Binary files /dev/null and b/images/1M_200Scale.gif differ diff --git a/images/200k uniform grid.gif b/images/200k uniform grid.gif new file mode 100644 index 0000000..9b017ea Binary files /dev/null and b/images/200k uniform grid.gif differ diff --git a/images/50k_uniform.gif b/images/50k_uniform.gif new file mode 100644 index 0000000..77dbd3a Binary files /dev/null and b/images/50k_uniform.gif differ diff --git a/images/Framerate vs Methods.png b/images/Framerate vs Methods.png new file mode 100644 index 0000000..b292254 Binary files /dev/null and b/images/Framerate vs Methods.png differ diff --git a/images/block_Size.png b/images/block_Size.png new file mode 100644 index 0000000..3d6d566 Binary files /dev/null and b/images/block_Size.png differ diff --git a/images/coherent_table.png b/images/coherent_table.png new file mode 100644 index 0000000..efb2120 Binary files /dev/null and b/images/coherent_table.png differ diff --git a/images/discrete_table.png b/images/discrete_table.png new file mode 100644 index 0000000..9124c19 Binary files /dev/null and b/images/discrete_table.png differ diff --git a/images/naive_table.png b/images/naive_table.png new file mode 100644 index 0000000..814130f Binary files /dev/null and b/images/naive_table.png differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..2018453 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -5,7 +5,7 @@ #include #include "utilityCore.hpp" #include "kernel.h" - +#include // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) @@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 32 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -85,6 +85,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_cohPos; +glm::vec3* dev_cohVel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -133,6 +135,8 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo } } + + /** * Initialize memory, update some globals */ @@ -168,7 +172,24 @@ void Boids::initSimulation(int N) { gridMinimum.y -= halfGridWidth; gridMinimum.z -= halfGridWidth; - // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + // TODO-2.1 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices); + + // TODO-2.3 + cudaMalloc((void**)&dev_cohPos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_cohPos failed!"); + cudaMalloc((void**)&dev_cohVel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_cohVel failed!"); + cudaDeviceSynchronize(); } @@ -230,21 +251,92 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); -} + // the new velocity + glm::vec3 new_vel(0.0f, 0.0f, 0.0f); + + // rule accumulators: perceived center (rule 1), separation vector (rule 2), perceived velocity (rule 3) + glm::vec3 perceived_center(0.0f, 0.0f, 0.0f); + glm::vec3 c(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f); + + int number_of_neighbors1 = 0; + int number_of_neighbors2 = 0; + int number_of_neighbors3 = 0; + + for (int i = 0; i < N; i++) { + // pos[i] is the other boid, pos[iSelf] is this boid + if (i != iSelf) { + + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + float distance = glm::distance(pos[i], pos[iSelf]); + if (distance < rule1Distance) { + perceived_center += pos[i]; + number_of_neighbors1++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= pos[i] - pos[iSelf]; // add an expel vector + number_of_neighbors2++; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel[i]; + number_of_neighbors3++; + } + } + } + // Rule 1 Update: turn the accumulated center into an offset toward it (stays zero if there were no neighbors) + if (number_of_neighbors1 > 0) { + perceived_center = perceived_center / (float)number_of_neighbors1 - pos[iSelf]; + } + // Rule 2 Update + if (number_of_neighbors2 > 0) { + c = c * rule2Scale; + } + // Rule 3 Update + if (number_of_neighbors3 > 0) { + perceived_velocity /= (float)number_of_neighbors3; + } + new_vel = vel[iSelf] + + perceived_center * rule1Scale + + c + + perceived_velocity * rule3Scale; + + return new_vel; +} +// Clamp a velocity to a maximum speed; vectors already below the limit are left untouched. +__device__ void clamp(glm::vec3& vel, float max) { + if (glm::length(vel) > max) { + vel = glm::normalize(vel) * max; + } +} /** * TODO-1.2 implement basic flocking * For each of the `N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + glm::vec3 *vel1, glm::vec3 *vel2) { + // Compute a new velocity based on pos and vel1 + // Clamp the speed + // Record the new velocity into vel2.
Question: why NOT vel1? + + + // Ans: vel1 and vel2 are Ping-Pong buffers. Here vel1 is used to calculate new velocity. + // Recording into vel2 will not affect the calculation. + // However, if we record new velocity to vel1, other threads in this kernel may suffer from the data change. + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // Compute a new velocity based on pos and vel1 + glm::vec3 new_vel = computeVelocityChange(N, index, pos, vel1); + //glm::vec3 new_dir = glm::normalize(new_vel); + + // Clamp the speed and record the new velocity into vel2 + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } /** @@ -281,7 +373,14 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { return x + y * gridResolution + z * gridResolution * gridResolution; } - +__device__ int get1DCellIndex(int index, int gridResolution, + glm::vec3 gridMin, float inverseCellWidth, glm::vec3* pos) +{ + int iX = glm::floor((pos[index].x - gridMin.x) * inverseCellWidth); + int iY = glm::floor((pos[index].y - gridMin.y) * inverseCellWidth); + int iZ = glm::floor((pos[index].z - gridMin.z) * inverseCellWidth); + return gridIndex3Dto1D(iX, iY, iZ, gridResolution); +} __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { @@ -289,6 +388,14 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + gridIndices[index] = get1DCellIndex(index, gridResolution, gridMin, + inverseCellWidth, pos); + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -299,29 +406,132 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { intBuffer[index] = value; } } +__global__ void kernShuffleData(int N, int* particleArrayIndices, + glm::vec3* pos, glm::vec3* coherentPos, + glm::vec3* vel, glm::vec3* coherentVel) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int pIndex = particleArrayIndices[index]; + coherentPos[index] = pos[pIndex]; + coherentVel[index] = vel[pIndex]; + +} __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { // TODO-2.1 // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // the current grid index + int currIndex = particleGridIndices[index]; + + // the last grid index + int lastIndex = index > 0 ? 
particleGridIndices[index - 1] : -1; + + if (currIndex != lastIndex) { + gridCellStartIndices[currIndex] = index; + if (lastIndex != -1) { + gridCellEndIndices[lastIndex] = index - 1; + } + } + } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int* gridCellStartIndices, int* gridCellEndIndices, + int* particleArrayIndices, + glm::vec3* pos, glm::vec3* vel1, glm::vec3* vel2) { + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + int maxCellIdx = gridResolution * gridResolution * gridResolution - 1; + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // - Identify the grid cell that this particle is in + glm::vec3 gridPos = (pos[index] - gridMin) * inverseCellWidth; + + // gridIndex is a 3D index, we will need to unroll it to 1D later + glm::ivec3 gridIndex = glm::ivec3(round(gridPos.x), round(gridPos.y), round(gridPos.z)); + + // parameters for later calculation + int number_of_neighbors1 = 0; + int number_of_neighbors2 = 0; + int number_of_neighbors3 = 0; + glm::vec3 perceived_center(0.0f, 0.0f, 0.0f); + glm::vec3 c(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f); + + for (int iZ = gridIndex.z - 1; iZ <= gridIndex.z; iZ++) { + for (int iY = gridIndex.y - 1; iY <= gridIndex.y; iY++) { + for (int iX = gridIndex.x - 1; iX <= gridIndex.x; iX++) { + // - Identify which cells may contain neighbors. This isn't always 8. + int cellIndex = gridIndex3Dto1D(iX, iY, iZ, gridResolution); + if (cellIndex < 0 || cellIndex > maxCellIdx) + continue; + // - For each cell, read the start/end indices in the boid pointer array. + // the indices between start and end are boids inside this cell + int startIndex = gridCellStartIndices[cellIndex]; + int endIndex = gridCellEndIndices[cellIndex]; + if (startIndex < 0) + continue; + + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ for (int i = startIndex; i <= endIndex; i++) { + int b = particleArrayIndices[i]; + // check not itself, index-boid + if (b != index) { + float distance = glm::distance(pos[b], pos[index]); + // Rule 1: Boids try to fly towards the centre of mass of neighbouring boids + if (distance < rule1Distance) { + perceived_center += pos[b]; + number_of_neighbors1++; + } + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= (pos[b] - pos[index]); // add an expel vector + number_of_neighbors2++; + } + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel1[b]; + number_of_neighbors3++; + } + } + + } + + } + } + } + + // Rule 1 Update + if (number_of_neighbors1 > 0) { + perceived_center /= number_of_neighbors1; + perceived_center -= pos[index]; + } + // Rule 3 Update + if (number_of_neighbors3 > 0) { + perceived_velocity /= number_of_neighbors3; + } + glm::vec3 new_vel = vel1[index] + + (perceived_center ) * rule1Scale + + c * rule2Scale + + perceived_velocity * rule3Scale; + // - Clamp the speed change before putting the new speed in vel2 + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -333,55 +543,217 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // except with one less level of indirection. // This should expect gridCellStartIndices and gridCellEndIndices to refer // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int maxCellIdx = gridResolution * gridResolution * gridResolution - 1; + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + // - Identify the grid cell that this particle is in + glm::vec3 gridPos = (pos[index] - gridMin) * inverseCellWidth; + // gridIndex is a 3D index, we will need to unroll it to 1D later + glm::ivec3 gridIndex = glm::ivec3(round(gridPos.x), round(gridPos.y), round(gridPos.z)); + glm::ivec3 gridIndex3D = glm::floor((pos[index] - glm::vec3(cellWidth / 2) - gridMin) * inverseCellWidth); + + // parameters for later calculation + int number_of_neighbors1 = 0; + int number_of_neighbors3 = 0; + glm::vec3 perceived_center = glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 c = glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 perceived_velocity = glm::vec3(0.0f, 0.0f, 0.0f); + + + for (int iZ = gridIndex3D.z; iZ <= gridIndex3D.z + 1; iZ++) { + for (int iY = gridIndex3D.y ; iY <= gridIndex3D.y + 1; iY++) { + for (int iX = gridIndex3D.x; iX <= gridIndex3D.x + 1; iX++) { + // - Identify which cells may contain neighbors. This isn't always 8. + int cellIndex = gridIndex3Dto1D(iX, iY, iZ, gridResolution); + if (iX < 0 || iY < 0 || iZ < 0 || iX >= gridResolution || iY >= gridResolution || iZ >= gridResolution) + { + continue; + } + if (cellIndex < 0 || cellIndex > maxCellIdx) + continue; + // - For each cell, read the start/end indices in the boid pointer array. 
+ // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + int startIndex = gridCellStartIndices[cellIndex]; + int endIndex = gridCellEndIndices[cellIndex]; + if (startIndex < 0) + continue; + + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + for (int i = startIndex; i <= endIndex; i++) { + // the coherent buffers are already in cell order, so i indexes pos/vel1 directly; skip this boid itself + if (i != index) { + float distance = glm::distance(pos[i], pos[index]); + // Rule 1: Boids try to fly towards the centre of mass of neighbouring boids + if (distance < rule1Distance) { + perceived_center += pos[i]; + number_of_neighbors1++; + } + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= (pos[i] - pos[index]); // add an expel vector + } + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel1[i]; + number_of_neighbors3++; + } + } + } + + } + } + } + + // Rule 1 Update: turn the accumulated center into an offset toward it (stays zero if there were no neighbors) + if (number_of_neighbors1 > 0) { + perceived_center = perceived_center / (float)number_of_neighbors1 - pos[index]; + } + // Rule 3 Update + if (number_of_neighbors3 > 0) { + perceived_velocity /= (float)number_of_neighbors3; + } + glm::vec3 new_vel = vel1[index] + + perceived_center * rule1Scale + + c * rule2Scale + + perceived_velocity * rule3Scale; + // - Clamp the speed change before putting the new speed in vel2 + if (glm::length(new_vel) > maxSpeed) + { + vel2[index] = glm::normalize(new_vel) * maxSpeed; + } + else { + vel2[index] = new_vel; + } } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>> (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, + dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } + + void Boids::stepSimulationScatteredGrid(float dt) { // TODO-2.1 // Uniform Grid Neighbor search using Thrust sort. // In Parallel: + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 gridCellThreadNum((gridCellCount + blockSize - 1) / blockSize); + + kernResetIntBuffer <<<gridCellThreadNum, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<gridCellThreadNum, blockSize>>> (gridCellCount, dev_gridCellEndIndices, numObjects - 1); // - label each particle with its array index as well as its grid index. - // Use 2x width grids. + // Use 2x width grids + kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + cudaDeviceSynchronize(); + + // - Unstable key sort using Thrust.
A stable sort isn't necessary, but you // are welcome to do a performance comparison. + // sort: (dev_keys, dev_keys + N, dev_values) + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + cudaDeviceSynchronize(); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions + kernIdentifyCellStartEnd <<< fullBlocksPerGrid, blockSize >>> (numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + cudaDeviceSynchronize(); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGrid, blockSize>>> ( + numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, + dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + cudaDeviceSynchronize(); + + // - Update positions with vel2 + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel2); + cudaDeviceSynchronize(); + // - Ping-pong buffers as needed + // vel1: current vel; vel2: next vel; + std::swap(dev_vel1, dev_vel2); + + } void Boids::stepSimulationCoherentGrid(float dt) { // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. // In Parallel: + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 gridCellThreadNum((gridCellCount + blockSize - 1) / blockSize); + + kernResetIntBuffer <<<gridCellThreadNum, blockSize>>> (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer <<<gridCellThreadNum, blockSize>>> (gridCellCount, dev_gridCellEndIndices, numObjects - 1); + // - Label each particle with its array index as well as its grid index. // Use 2x width grids + kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + cudaDeviceSynchronize(); + + //dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); + //dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices); // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + cudaDeviceSynchronize(); + + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd <<< fullBlocksPerGrid, blockSize >>> (numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + cudaDeviceSynchronize(); // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array.
// CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions + kernShuffleData<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_cohPos, + dev_vel1, dev_cohVel); + checkCUDAErrorWithLine("kernShuffleData failed!"); + cudaDeviceSynchronize(); + + // - Perform velocity updates using neighbor search, update vel to vel2 + kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGrid, blockSize>>> ( + numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, + dev_gridCellEndIndices, dev_cohPos, dev_cohVel, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!"); + cudaDeviceSynchronize(); + + // - Update positions with vel2 + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_cohPos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + cudaDeviceSynchronize(); + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + std::swap(dev_vel1, dev_vel2); + std::swap(dev_cohPos, dev_pos); + } void Boids::endSimulation() { @@ -390,6 +762,15 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + + cudaFree(dev_cohPos); + cudaFree(dev_cohVel); + } void Boids::unitTest() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..8a53937 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 20000; const float DT = 0.2f; /**
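One note on the `main.cpp` change above: with both `UNIFORM_GRID` and `COHERENT_GRID` set to 1, the coherent-grid path is the one that runs each frame. The sketch below shows the assumed selection logic; the `runCUDA` name and the exact `#if` chain are an illustration of how the base project typically dispatches, not a copy of the file.

```cpp
// Assumed dispatch logic in main.cpp (illustrative sketch, not verbatim).
void runCUDA() {
#if UNIFORM_GRID && COHERENT_GRID
  Boids::stepSimulationCoherentGrid(DT);   // reshuffled pos/vel buffers (TODO-2.3)
#elif UNIFORM_GRID
  Boids::stepSimulationScatteredGrid(DT);  // uniform grid with index indirection (TODO-2.1)
#else
  Boids::stepSimulationNaive(DT);          // brute-force O(n^2) update (TODO-1.2)
#endif
}
```

To collect the scattered-grid or naive numbers reported in the analysis, `COHERENT_GRID` (and, for the naive case, `UNIFORM_GRID`) would be set back to 0.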