diff --git a/README.md b/README.md index d63a6a1..5454d83 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,55 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Edward Zhang + * https://www.linkedin.com/in/edwardjczhang/ + * https://zedward23.github.io/personal_Website/ + +* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU -### (TODO: Your README) +### ReadMe + +Demos: + +Naive + +![](images/Naive.gif) + +Scattered Uniform Grid + +![](images/ScatterGrid.gif) + +Coherent Uniform Grid + +![](images/CoherentGrid.gif) + +* For each implementation, how does changing the number of boids affect +performance? Why do you think this is? +![](images/BoidsVFPS.png) + +Increasing the number of boids decreased performance across all implementations. I think this is because the raw number of computations simply grows as boids are added: each additional boid adds another N+1 distance calculations per frame. + +* For each implementation, how does changing the block count and block size +affect performance? Why do you think this is? + +![](images/BlockCountVFPS1.png) + +Block count is inversely correlated with the grid cell width: the larger each cell is, the fewer cells fit in the simulation grid, so the block count is lower. We observe that performance worsens as the grid cell width increases; I believe this is because a larger cell width eventually converges back on the naive implementation, since we end up comparing against nearly every boid in the grid. + +![](images/BlockSizeVFPS.png) +Until the block size reaches a minimum threshold, there simply are not enough threads available to do meaningful work in parallel. The advantages of parallelism are lost and more of the work is forced to run in series, which worsens performance. Past that threshold, performance stabilizes and then gently tapers off. This suggests there is likely an optimal block size that maximizes performance. + + +* For the coherent uniform grid: did you experience any performance improvements +with the more coherent uniform grid? Was this the outcome you expected? +Why or why not? +I did not see a performance increase; I believe this is because I probably did not implement the reshuffling step properly in parallel, so the headroom gained by reducing the number of indirect lookups was offset by the cost of the reshuffle itself. Had it been done correctly, I believe truly removing the scattered global-memory accesses should have improved performance. + +* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? +Why or why not? Be careful: it is insufficient (and possibly incorrect) to say +that 27-cell is slower simply because there are more cells to check! + +It did not alter the frame rate much; I believe this is because while we are checking more cells for neighbors, each of those cells ultimately contains fewer boids. There is a trade-off between checking more cells and comparing against fewer boids per cell. Include screenshots, analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.)
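For reference on the coherent-grid reshuffle discussed above, the reshuffle itself can stay fully parallel. Below is a minimal sketch, not the code in this diff, of gathering `pos`/`vel1` into cell-contiguous buffers with `thrust::gather`, assuming `dev_particleArrayIndices` already holds the boid indices sorted by `thrust::sort_by_key`; the helper name `reshuffleCoherent` is hypothetical.

```cpp
// Hypothetical helper (not part of this diff): reshuffle boid data so pos/vel
// are contiguous per grid cell, using a parallel gather on the device.
#include <thrust/device_ptr.h>
#include <thrust/gather.h>
#include <glm/glm.hpp>

void reshuffleCoherent(int N, int* dev_particleArrayIndices,
                       glm::vec3* dev_pos, glm::vec3* dev_vel1,
                       glm::vec3* dev_sorted_pos, glm::vec3* dev_sorted_vel) {
  thrust::device_ptr<int> map(dev_particleArrayIndices);
  thrust::device_ptr<glm::vec3> pos(dev_pos), vel(dev_vel1);
  thrust::device_ptr<glm::vec3> sortedPos(dev_sorted_pos), sortedVel(dev_sorted_vel);

  // sorted[i] = original[particleArrayIndices[i]]; one O(N) gather per buffer
  thrust::gather(map, map + N, pos, sortedPos);
  thrust::gather(map, map + N, vel, sortedVel);
}
```

This performs the same operation the new `kernFillSortedArray` kernel in `kernel.cu` is intended to do (that kernel reads its first pointer as the source and writes its second as the destination), so the reshuffle costs only O(N) memory traffic rather than an additional sort.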
diff --git a/images/BlockCountVFPS.png b/images/BlockCountVFPS.png new file mode 100644 index 0000000..1509374 Binary files /dev/null and b/images/BlockCountVFPS.png differ diff --git a/images/BlockCountVFPS1.png b/images/BlockCountVFPS1.png new file mode 100644 index 0000000..1509374 Binary files /dev/null and b/images/BlockCountVFPS1.png differ diff --git a/images/BlockSizeVFPS.png b/images/BlockSizeVFPS.png new file mode 100644 index 0000000..75c06a9 Binary files /dev/null and b/images/BlockSizeVFPS.png differ diff --git a/images/BoidsVFPS.png b/images/BoidsVFPS.png new file mode 100644 index 0000000..82bdd4a Binary files /dev/null and b/images/BoidsVFPS.png differ diff --git a/images/CoherentGrid.gif b/images/CoherentGrid.gif new file mode 100644 index 0000000..5ab818e Binary files /dev/null and b/images/CoherentGrid.gif differ diff --git a/images/Naive.gif b/images/Naive.gif new file mode 100644 index 0000000..69c216b Binary files /dev/null and b/images/Naive.gif differ diff --git a/images/ScatterGrid.gif b/images/ScatterGrid.gif new file mode 100644 index 0000000..ea0b4d8 Binary files /dev/null and b/images/ScatterGrid.gif differ diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..441bfed 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 8 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -85,6 +85,9 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_sorted_pos; +glm::vec3* dev_sorted_vel; + // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -169,6 +172,28 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices); + dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices); + + cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_pos failed!"); + + cudaMalloc((void**)&dev_sorted_vel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_vel failed!"); + cudaDeviceSynchronize(); } @@ -210,8 +235,8 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale); - kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); + kernCopyPositionsToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyVelocitiesToVBO <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_vel1, vbodptr_velocities, scene_scale); checkCUDAErrorWithLine("copyBoidsToVBO failed!"); @@ -224,16 +249,54 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) ******************/ /** -* LOOK-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. +* TODO-1.2 You can use this as a helper for kernUpdateVelocityBruteForce. * __device__ code can be called from a __global__ context * Compute the new velocity on the body with index `iSelf` due to the `N` boids * in the `pos` and `vel` arrays. */ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + glm::vec3 perceivedCenter = glm::vec3(0, 0, 0); + glm::vec3 c = glm::vec3(0, 0, 0); + glm::vec3 perceivedVelocity = glm::vec3(0, 0, 0); + + int numOfNeighborsP = 0; + int numOfNeighborsV = 0; + for (int i = 0; i < N; i++) { + if (i != iSelf) { + auto dist = glm::distance(pos[i], pos[iSelf]); + if (dist < rule1Distance) { + perceivedCenter += pos[i]; + numOfNeighborsP++; + } + if (dist < rule2Distance) { + c -= pos[i] - pos[iSelf]; + } + if (dist < rule3Distance) { + perceivedVelocity += vel[i]; + numOfNeighborsV++; + } + } + } + if (numOfNeighborsP != 0) { + perceivedCenter /= numOfNeighborsP; + } + + if (numOfNeighborsV != 0) { + perceivedVelocity /= numOfNeighborsV; + } + + glm::vec3 rule1Vel = (perceivedCenter - pos[iSelf]) * rule1Scale; + glm::vec3 rule2Vel = c * rule2Scale; + glm::vec3 rule3Vel = perceivedVelocity * rule3Scale; + // Rule 2: boids try to stay a distance d away from each other // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + auto totalVel = vel[iSelf] + rule1Vel + rule2Vel + rule3Vel; + + return (glm::length(totalVel) <= maxSpeed ?
totalVel : glm::normalize(totalVel) * maxSpeed); + //return glm::vec3(0.1f, 0.0f, 0.0f); } /** @@ -245,6 +308,11 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + vel2[index] = computeVelocityChange(N, index, pos, vel1); } /** @@ -272,6 +340,49 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { pos[index] = thisPos; } +__device__ glm::vec3 computeVelocityChangeUsingGrid(int index, int start, int end, int* indices, const glm::vec3* pos, const glm::vec3* vel) { + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + glm::vec3 perceivedCenter = glm::vec3(0,0,0); + glm::vec3 c = glm::vec3(0, 0, 0); + glm::vec3 perceivedVelocity = glm::vec3(0, 0, 0); + + int numOfNeighborsP = 0; + int numOfNeighborsV = 0; + + for (int i = start; i <= end; i++) { + if (indices[i] != index) { + auto dist = glm::distance(pos[i], pos[index]); + if (dist < rule1Distance) { + perceivedCenter += pos[i]; + numOfNeighborsP++; + } + if (dist < rule2Distance) { + c -= pos[i] - pos[index]; + } + if (dist < rule3Distance) { + perceivedVelocity += vel[i]; + numOfNeighborsV++; + } + } + } + + if (numOfNeighborsP != 0) { + perceivedCenter /= numOfNeighborsP; + } + + if (numOfNeighborsV != 0) { + perceivedVelocity /= numOfNeighborsV; + } + + glm::vec3 rule1Vel = (perceivedCenter - pos[index]) * rule1Scale; + glm::vec3 rule2Vel = c * rule2Scale; + glm::vec3 rule3Vel = perceivedVelocity * rule3Scale; + + auto totalVel = vel[index] + rule2Vel + rule3Vel; //rule1Vel + + return (glm::length(totalVel) <= maxSpeed ? totalVel : glm::normalize(totalVel) * maxSpeed); +} + // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. // LOOK-2.3 Looking at this method, what would be the most memory efficient // order for iterating over neighboring grid cells? @@ -288,11 +399,26 @@ __global__ void kernComputeIndices(int N, int gridResolution, // TODO-2.1 // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + // boid data in pos and vel1/vel2 ?? what does this mean + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + glm::vec3 currPos = pos[index]; + + gridIndices[index] = gridIndex3Dto1D(std::floor((currPos.x - gridMin.x) * inverseCellWidth), + std::floor((currPos.y - gridMin.y) * inverseCellWidth), + std::floor((currPos.z - gridMin.z) * inverseCellWidth), + gridResolution); + + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids +// <> __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; if (index < N) { @@ -306,6 +432,42 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + + //ParticleGridIndices should hopefully be sorted ?? 
+ // + //particleGridIndice[4] = the grid index of 4th index boid + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + //Account for array out of bounds + if (index == N-1){ + gridCellEndIndices[particleGridIndices[index]] = index; + if (particleGridIndices[index] != particleGridIndices[index - 1]) { + gridCellStartIndices[particleGridIndices[index]] = index; + } + } + else if (index == 0) { + gridCellStartIndices[particleGridIndices[index]] = index; + if (particleGridIndices[index] != particleGridIndices[index + 1]) { + gridCellEndIndices[particleGridIndices[index]] = index; + } + } + else { + if (particleGridIndices[index] != particleGridIndices[index - 1]) { + gridCellStartIndices[particleGridIndices[index]] = index; + } + if (particleGridIndices[index] != particleGridIndices[index + 1]) { + gridCellEndIndices[particleGridIndices[index]] = index; + } + } + //Out of bounds babyy + + + + //gridStart and gridEnd should be the same length; each pair represents the start of a new block } __global__ void kernUpdateVelNeighborSearchScattered( @@ -317,11 +479,98 @@ __global__ void kernUpdateVelNeighborSearchScattered( // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. + // - For each cell, read the start/end indices in the boid pointer array. << // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + + glm::vec3 currPos = pos[index]; + + float radius = imax(rule1Distance, imax(rule2Distance, rule3Distance)); + + float maxX = std::floor(((currPos.x + radius)-gridMin.x) * inverseCellWidth); + float maxY = std::floor(((currPos.y + radius)-gridMin.y) * inverseCellWidth); + float maxZ = std::floor(((currPos.z + radius)-gridMin.z) * inverseCellWidth); + float minX = std::floor(((currPos.x - radius)-gridMin.x) * inverseCellWidth); + float minY = std::floor(((currPos.y - radius)-gridMin.y) * inverseCellWidth); + float minZ = std::floor(((currPos.z - radius)-gridMin.z) * inverseCellWidth); + + int gridIdx = gridIndex3Dto1D(std::floor((currPos.x - gridMin.x) * inverseCellWidth), + std::floor((currPos.y - gridMin.y) * inverseCellWidth), + std::floor((currPos.z - gridMin.z) * inverseCellWidth), + gridResolution); + + glm::vec3 perceivedCenter = glm::vec3(0,0,0); + glm::vec3 c = glm::vec3(0, 0, 0); + glm::vec3 perceivedVelocity = glm::vec3(0, 0, 0); + + int numOfNeighborsP = 0; + int numOfNeighborsV = 0; + + for (int i = minX; i <= maxX; i++) { + for (int j = minY; j <= maxY; j++) { + for (int k = minZ; k <= maxZ; k++) { + int idx = gridIndex3Dto1D(i, j, k, gridResolution); + int start = gridCellStartIndices[idx]; + int end = gridCellEndIndices[idx]; + + for (int l = start; l <= end; l++) { + if (particleArrayIndices[l] != index) { + auto dist = glm::distance(pos[particleArrayIndices[l]], pos[index]); + if (dist < rule1Distance) { + perceivedCenter += pos[particleArrayIndices[l]]; + numOfNeighborsP++; + } + if (dist < rule2Distance) { + c -= pos[particleArrayIndices[l]] - pos[index]; + } + if (dist < rule3Distance) { + perceivedVelocity += 
vel1[particleArrayIndices[l]]; + numOfNeighborsV++; + } + } + } + + } + } + } + + if (numOfNeighborsP != 0) { + perceivedCenter /= numOfNeighborsP; + } + + if (numOfNeighborsV != 0) { + perceivedVelocity /= numOfNeighborsV; + } + + glm::vec3 rule1Vel = (perceivedCenter - pos[index]) * rule1Scale; + glm::vec3 rule2Vel = c * rule2Scale; + glm::vec3 rule3Vel = perceivedVelocity * rule3Scale; + + auto totalVel = vel1[index] + rule1Vel + rule2Vel + rule3Vel; //rule1Vel + + vel2[index] = (glm::length(totalVel) <= maxSpeed ? totalVel : glm::normalize(totalVel) * maxSpeed); + //float length = glm::length(computeVelocityChangeUsingGrid(index, start, end, particleArrayIndices, pos, vel1)); + + + +} + +__global__ void kernFillSortedArray(int N, glm::vec3* toBeSorted, glm::vec3* sorted, int* indices) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + sorted[index] = toBeSorted[indices[index]]; + } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +590,79 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + + glm::vec3 currPos = pos[index]; + + float radius = imax(rule1Distance, imax(rule2Distance, rule3Distance)); + + float maxX = std::floor(((currPos.x + radius) - gridMin.x) * inverseCellWidth); + float maxY = std::floor(((currPos.y + radius) - gridMin.y) * inverseCellWidth); + float maxZ = std::floor(((currPos.z + radius) - gridMin.z) * inverseCellWidth); + float minX = std::floor(((currPos.x - radius) - gridMin.x) * inverseCellWidth); + float minY = std::floor(((currPos.y - radius) - gridMin.y) * inverseCellWidth); + float minZ = std::floor(((currPos.z - radius) - gridMin.z) * inverseCellWidth); + + int gridIdx = gridIndex3Dto1D(std::floor((currPos.x - gridMin.x) * inverseCellWidth), + std::floor((currPos.y - gridMin.y) * inverseCellWidth), + std::floor((currPos.z - gridMin.z) * inverseCellWidth), + gridResolution); + + glm::vec3 perceivedCenter = glm::vec3(0, 0, 0); + glm::vec3 c = glm::vec3(0, 0, 0); + glm::vec3 perceivedVelocity = glm::vec3(0, 0, 0); + + int numOfNeighborsP = 0; + int numOfNeighborsV = 0; + + for (int i = minX; i <= maxX; i++) { + for (int j = minY; j <= maxY; j++) { + for (int k = minZ; k <= maxZ; k++) { + int idx = gridIndex3Dto1D(i, j, k, gridResolution); + int start = gridCellStartIndices[idx]; + int end = gridCellEndIndices[idx]; + + for (int l = start; l <= end; l++) { + if (l != index) { + auto dist = glm::distance(pos[l], pos[index]); + if (dist < rule1Distance) { + perceivedCenter += pos[l]; + numOfNeighborsP++; + } + if (dist < rule2Distance) { + c -= pos[l] - pos[index]; + } + if (dist < rule3Distance) { + perceivedVelocity += vel1[l]; + numOfNeighborsV++; + } + } + } + + } + } + } + + if (numOfNeighborsP != 0) { + perceivedCenter /= numOfNeighborsP; + } + + if (numOfNeighborsV != 0) { + perceivedVelocity /= numOfNeighborsV; + } + + glm::vec3 rule1Vel = (perceivedCenter - pos[index]) * rule1Scale; + glm::vec3 rule2Vel = c * rule2Scale; + glm::vec3 rule3Vel = perceivedVelocity * rule3Scale; + + auto totalVel = vel1[index] + rule1Vel + rule2Vel + rule3Vel; //rule1Vel + + vel2[index] = (glm::length(totalVel) <= maxSpeed ? 
totalVel : glm::normalize(totalVel) * maxSpeed); + } /** @@ -348,22 +670,34 @@ __global__ void kernUpdateVelNeighborSearchCoherent( */ void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdateVelocityBruteForce <<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos <<<fullBlocksPerGrid, blockSize>>> (numObjects, dt, dev_pos, dev_vel1); + + // TODO-1.2 ping-pong the velocity + std::swap(dev_vel1, dev_vel2); + } void Boids::stepSimulationScatteredGrid(float dt) { // TODO-2.1 // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_gridCellStartIndices, -1); + kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_gridCellEndIndices, -1); + + kernComputeIndices <<<fullBlocksPerGrid, blockSize>>> (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, + dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +716,26 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_gridCellStartIndices, -1); + kernResetIntBuffer << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_gridCellEndIndices, -1); + + kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, + dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernFillSortedArray << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_sorted_pos, dev_pos, dev_particleArrayIndices); + kernFillSortedArray << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_sorted_vel, dev_vel1, dev_particleArrayIndices); + + kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_sorted_pos, dev_sorted_vel, dev_vel2); + kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + + std::swap(dev_vel1, dev_vel2); + std::swap(dev_pos, dev_sorted_pos); + } void Boids::endSimulation() { @@ -390,6 +744,15 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_particleGridIndices); + + cudaFree(dev_sorted_vel); + cudaFree(dev_sorted_pos); + + //Not really sure what new things I need } void Boids::unitTest() { @@ -404,15 +767,15 @@ void Boids::unitTest() { std::unique_ptr<int[]>intValues{ new int[N] }; intKeys[0] = 0; intValues[0] = 0; + intKeys[6] = 2; intValues[6] = 6; + intKeys[7] = 0; intValues[7] = 7; + intKeys[8] = 5; intValues[8] = 8; + intKeys[9] = 6; intValues[9] = 9; intKeys[1] = 1; intValues[1] = 1; intKeys[2] = 0; intValues[2] = 2; intKeys[3] = 3; intValues[3] = 3; intKeys[4] = 0; intValues[4] = 4; intKeys[5] = 2; intValues[5] = 5; - intKeys[6] = 2; intValues[6] = 6; - intKeys[7] = 0; intValues[7] = 7; - intKeys[8] = 5; intValues[8] = 8; - intKeys[9] = 6; intValues[9] = 9; cudaMalloc((void**)&dev_intKeys, N * sizeof(int)); checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!"); diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..c81ea5b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ // ================ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID -#define VISUALIZE 1 +#define VISUALIZE 0 #define UNIFORM_GRID 0 #define COHERENT_GRID 0