diff --git a/README.md b/README.md index 0e38ddb..89518f8 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,392 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Nick Moon + * [LinkedIn](https://www.linkedin.com/in/nick-moon1/), [personal website](https://nicholasmoon.github.io/) +* Tested on: Windows 10, AMD Ryzen 9 5900HS @ 3.0GHz 32GB, NVIDIA RTX 3060 Laptop 6GB (Personal Laptop) -### (TODO: Your README) +**This is a Stream Compaction algorithm implementation in C++ using CUDA for GPU acceleration. This allows for +compacting arrays with millions of elements in parallel on the GPU.** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Results + +*Testbench functional and performance results for scan and stream compaction on input array size of 2^25 (33554432):* +``` +number of elements in input array for power of 2 tests = 33554432 +number of elements in input array for non-power of 2 tests = 33554429 + +**************** +** SCAN TESTS ** +**************** + [ 34 41 19 15 6 8 5 37 47 15 4 40 20 ... 44 16 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 19.1805ms (std::chrono Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 19.9557ms (std::chrono Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 29.1328ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 29.0673ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 24.446ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 21.2756ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.16003ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.17952ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 3 0 0 1 1 0 3 1 0 1 1 ... 2 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 51.9551ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 2 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 51.9623ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 3 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 103.018ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 2 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 25.3891ms (CUDA Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 
2 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 25.2508ms (CUDA Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 3 2 3 ] + passed +``` + + +## Implementation + +### Array Scan + +The array scan algorithm, given an input array of ints ```idata``` of size ```n```, outputs an array ```odata```, also of size ```n```, where the value at each index ```i``` in ```odata``` is the sum of elements ```0``` through ```i - 1``` of ```idata```. Exclusive scan is the version implemented in this repository, as opposed to inclusive scan: ```odata[0] = 0```, and each element of ```odata``` excludes its corresponding value in ```idata``` from the sum. Each of the different implementations of the exclusive scan algorithm is discussed in more detail below. + +#### CPU Implementation + +The CPU implementation implements the array scan algorithm in a serial format, according to the pseudocode: + +``` +function scan_cpu(input_array, output_array, number_of_elements): + + output_array[0] = 0 + for i in range [1, number_of_elements): + output_array[i] = output_array[i - 1] + input_array[i - 1] + +end +``` + +#### Naive CUDA Implementation + +The naive CUDA implementation maps the original CPU scan approach onto parallel GPU hardware. This necessitates the use of double-buffering in order to avoid race conditions: the two GPU arrays used as input and output are swapped each iteration. + +``` +function scan_gpu(input_array, output_array, number_of_elements): + + let o_array_gpu_0 = input_array (this is an array on the GPU) + let o_array_gpu_1 = o_array_gpu_0 (this is an array on the GPU) + + for all k in parallel: + shift_array_right(k, o_array_gpu_1, o_array_gpu_0) + swap(o_array_gpu_0, o_array_gpu_1) + + for d in range [1, ceil( log_2(number_of_elements) ) ]: + for all k in parallel: + naive_scan_iteration(k, o_array_gpu_0, o_array_gpu_1, 2^d) + swap(o_array_gpu_0, o_array_gpu_1) + + output_array = o_array_gpu_0 + +end + +kernel shift_array_right(thread_ID, input_array, output_array): + if thread_ID == 0 then: + output_array[0] = 0 + else: + output_array[thread_ID] = input_array[thread_ID - 1] + +end + +kernel naive_scan_iteration(thread_ID, input_array, output_array, offset): + if (thread_ID < offset) then: + output_array[thread_ID] = input_array[thread_ID] + else: + output_array[thread_ID] = input_array[thread_ID - offset] + input_array[thread_ID] + +end +``` + +#### Work-Efficient CUDA Implementation + +To make the parallel approach more efficient, a different scheme is used. By treating the array as a balanced binary tree, the new approach separates the algorithm into two steps: the up-sweep and the down-sweep. The up-sweep is a parallel reduction, which stores the sum of the array in the last index. A picture of the up-sweep on an example array is shown in Figure 1 below: + +![](images/figures/scan_upsweep.PNG) +*Figure 1: Demonstration of efficient scan up-sweep operation.* + +After the up-sweep, the element at the last index (the sum of the array) is zeroed out, and then the down-sweep occurs, as seen in Figure 2 below: + +![](images/figures/scan_downsweep.PNG) +*Figure 2: Demonstration of efficient scan down-sweep operation.*
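+ +In pseudocode, the two sweeps look roughly as follows. This is a sketch of the standard work-efficient (Blelloch) scan matching Figures 1 and 2; it assumes ```n``` is a power of two, which the implementation guarantees by padding the array: + +``` +function scan_gpu_efficient(array, n): + + // UP-SWEEP: parallel reduction; afterwards array[n - 1] holds the total sum + for d in range [0, log_2(n) ): + for all k in [0, n) with k mod 2^(d+1) == 0, in parallel: + array[k + 2^(d+1) - 1] += array[k + 2^d - 1] + + // zero out the root before sweeping back down + array[n - 1] = 0 + + // DOWN-SWEEP: push partial sums back down the tree to form the exclusive scan + for d in range [ log_2(n) - 1, 0 ]: + for all k in [0, n) with k mod 2^(d+1) == 0, in parallel: + t = array[k + 2^d - 1] + array[k + 2^d - 1] = array[k + 2^(d+1) - 1] + array[k + 2^(d+1) - 1] += t + +end +```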
+ +#### Thrust Implementation + +For the thrust implementation, the input and output arrays are simply converted to thrust library ```device_vector```s, and the thrust ```exclusive_scan()``` function is called with them. The thrust device vector for the output data is then transferred back to the host array. + +### Stream Compaction + +Stream compaction is an array algorithm that, given an input array of ints ```idata``` of size ```n```, returns an output array ```odata``` of some size in ```[0, n]``` such that ```odata``` contains only the values ```x``` in ```idata``` that satisfy some criteria function ```f(x)```. This is essentially used to compact an array into a smaller size by removing unneeded elements, as determined by the criteria function ```f(x)```. Values for which ```f(x)``` returns ```true``` are kept, while values for which ```f(x)``` returns ```false``` are removed. + +#### CPU Implementation (without scan) + +The CPU implementation (without scan) implements the stream compaction algorithm in a serial format, according to the pseudocode: + +``` +function stream_compact_cpu_no_scan(input_array, output_array, number_of_elements): + + let num_compacted_elements = 0 + for i in range [0, number_of_elements): + if input_array[i] != 0 then: + output_array[num_compacted_elements] = input_array[i] + num_compacted_elements = num_compacted_elements + 1 + + return num_compacted_elements + +end +``` + +This method keeps a rolling index into the output array while looping through the input array, writing to (and advancing) that index only when a value in the input array satisfies the condition function, which in this case is ```x != 0```. + +#### CPU Implementation (with scan) + +The CPU implementation (with scan) implements the stream compaction algorithm in a "pseudo"-parallel format, meaning that the structure of the algorithm mimics what will be used for the CUDA implementation, but still operates on the array serially. This involves a (serial) pass that checks whether each element in the input array is non-zero, a (serial) scan of this boolean data, and then a (serial) scatter over the scan result to build the final output array. The pseudocode is below, with a short worked example after it: + +``` +function stream_compact_cpu_scan(input_array, output_array, number_of_elements): + + let num_compacted_elements = 0 + + // GENERATE CONDITION VALUES + + for i in range [0, number_of_elements): + if input_array[i] == 0 then: + output_array[i] = 0 + else: + output_array[i] = 1 + + // SCAN + + let temp_array[number_of_elements] + temp_array[0] = 0 + + for i in range [1, number_of_elements): + temp_array[i] = temp_array[i - 1] + output_array[i - 1] + + // SCATTER + + for i in range [0, number_of_elements): + if output_array[i] == 1 then: + output_array[temp_array[i]] = input_array[i] + num_compacted_elements = num_compacted_elements + 1 + + return num_compacted_elements + +end +```
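+ +For example, on a small made-up input, the three steps transform the data as follows; note how the exclusive scan of the booleans hands each kept element its index in the compacted output: + +``` +input: [ 3 0 4 0 0 5 ] +booleans: [ 1 0 1 0 0 1 ] +scan: [ 0 1 1 2 2 2 ] (exclusive scan of the booleans) +scatter: output[0] = 3, output[1] = 4, output[2] = 5 +result: [ 3 4 5 ], num_compacted_elements = 2 + 1 = 3 +```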
+ +#### Efficient Parallel Implementation + +The GPU version of this algorithm directly models the CPU-with-scan version, except this time the 3 steps (the condition check, the scan, and the scatter) are all performed on the GPU. The scan function is the same as the efficient parallel scan from the previous section, while the pseudocode for the other two functions is shown below: + +``` + +kernel map_to_boolean(thread_ID, input_array, output_array): + + if input_array[thread_ID] == 0 then: + output_array[thread_ID] = 0 + else: + output_array[thread_ID] = 1 + +end + +kernel scatter(thread_ID, input_array, output_array, boolean_array, indices_array): + + if boolean_array[thread_ID] == 1 then: + output_array[indices_array[thread_ID]] = input_array[thread_ID] + +end +```
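+ +Stitched together on the host, the whole pipeline looks roughly like the following sketch (simplified from the actual ```compact()``` in ```efficient.cu```; the power-of-two padding, timing, and error checking are omitted, and ```blockSize``` is assumed to be defined as in ```common.h```): + +``` +int numBlocks = (n + blockSize - 1) / blockSize; // one thread per element + +// 1) condition check: dev_boolean_map[i] = (dev_idata[i] != 0) ? 1 : 0 +StreamCompaction::Common::kernMapToBoolean<<<numBlocks, blockSize>>>(n, dev_boolean_map, dev_idata); + +// 2) exclusive scan of the booleans gives each kept element its output index +cudaMemcpy(dev_indices, dev_boolean_map, sizeof(int) * n, cudaMemcpyDeviceToDevice); +// ... the up-sweep and down-sweep kernels from the previous section run on dev_indices ... + +// 3) scatter each kept element to its compacted position +StreamCompaction::Common::kernScatter<<<numBlocks, blockSize>>>(n, dev_odata, dev_idata, dev_boolean_map, dev_indices); +```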
+ +### Extra Credit + +I implemented the CPU radix sort but did not end up finishing my GPU version. + +## Testing Strategy + +**All tests and results are done in Release mode in Visual Studio** + +The first step in generating results was to figure out the optimal block size for the different GPU implementations of the scan and stream compaction algorithms. Data from each implementation was collected with a constant input array size of 2^25 (33,554,432) for power-of-two block sizes from 32 to 1024, and the results are shown in Figure 3 below. + +![](images/figures/graph_blocksize.png) +*Figure 3: Effect of CUDA block size on runtime of scan and stream compaction.* + +As seen from the graph, for both exclusive scan and stream compaction, the optimal block size is around 128-256. Below these values, the runtime rises sharply with decreasing block size, getting as high as almost 4x worse performance at a block size of 32 compared to 256 for the parallel stream compaction on an array whose size is not a power of 2. Similarly, the runtime also grows with increasing block size past 256, at least for all the algorithms save the naive scan. This growth appears roughly linear for those algorithms, while the runtime stays roughly constant for the naive scan. + +To be more specific, the block size with the lowest runtime, and thus the **optimal block size**, for the **naive parallel scan is 128**, and for **all others is 256**. These values will thus be used to test and retrieve results for the forthcoming topics of discussion. + +In the **Performance Analysis** section below, all data is from the ***Not Power of 2 Array Size*** tests. This is the most general test case for analyzing these algorithms, as most often an array will not fit neatly into the CUDA blocks. Additionally, there is a negligible difference in the runtimes between the power-of-2 and non-power-of-2 tests. This can be seen in Figure 4 below: + +![](images/figures/graph_pow2.png) +*Figure 4: Runtime comparison between tests on arrays with power-of-2 sizes and not.* + +## Performance Analysis + +**All tests and results are done in Release mode in Visual Studio** + +### Scan + +As can be seen in Figure 5 below, the runtime of the scan algorithm increases roughly linearly for each of the different implementations, but the slope of this increase is different for each one. + +The thrust version was by far the fastest in most cases and had the smallest slope. In all test cases, the thrust version maintained a runtime below 3ms. This makes sense, as thrust is a highly optimized GPU and parallel algorithm library developed for high-performance applications. Its performance relative to the CUDA implementations found in this repository is discussed further down the readme. + +The next best version was the CPU scan. While this is initially surprising, there are a couple of benefits the CPU version has with respect to the GPU versions. First, the logic is very simple for computing each element of the output, and the serial nature of the algorithm means it is very cache coherent. It also requires no additional memory allocation (CPU or GPU), which all the parallel versions require. + +The work-efficient parallel scan is the next best version. This is slower than the CPU and thrust versions, but faster than the naive parallel version in the long run. The reason this is slower than the CPU version is, first of all, that it uses only global memory. This is the GPU memory with the highest latency, and, especially when compared with the CPU version's cache coherency, is a huge bottleneck on the algorithm. Another problem is that the number of threads launched remains constant each iteration of the algorithm. This means that, during both the up-sweep and down-sweep, there are potentially millions of threads allocated for a pass (i.e. one of the last data points on the graph), while only one or a few are actually active at a time. A more optimized version of the algorithm would launch only the threads that are needed. + +Finally, the naive parallel scan is the worst version. For the 33-million-element array, this algorithm is 50% slower than the efficient parallel scan, and 30x slower than the thrust library implementation. This is mostly due to a couple of factors. First, the naive parallel algorithm never decreases the number of threads (number of blocks) launched for its kernels, which means at the final iteration there are millions of threads being launched (for the final data point), and only half of them are actually doing work. Relatedly, the number of idle threads merely doubles each iteration: at iteration ```d```, only the ```2^(d-1)``` threads below the offset degrade to plain copies, so compared with the efficient version, which halves the number of active threads every iteration, the naive approach keeps nearly all of its threads summing until the last few iterations. A final problem with this approach is that it requires double-buffering, meaning at least double the memory of the efficient version, along with the extra memory traffic that comes with that much additional memory usage. + +![](images/figures/graph_scan.png) +*Figure 5: Effect of input array size on runtime of scan algorithm.* + +##### Thrust NSight Analysis + +Taking a deeper look into the Thrust version of exclusive scan, the NSight profiling timeline seen in Figures 6 and 7 reveals that it consists of at least 1 ```cudaEventCreate``` API call, as well as 3 ```cudaMemcpyAsync``` calls. Two of these ```cudaMemcpyAsync``` calls are performed as part of converting the input and output arrays from C-style arrays to Thrust ```device_vector```s, and the third comes from retrieving the output data from the output ```device_vector``` and placing it into the output C-style array. Between the first two ```cudaMemcpyAsync``` calls and the third are ```cudaMalloc```, ```cudaStreamSynchronize```, and ```cudaFree``` function calls. This is where the actual implementation of the Thrust exclusive scan function does its work: it allocates device memory, operates on the data, and then frees the memory at the end. + +![](images/results/thrust_nsight_timeline_memcpy.PNG) +*Figure 6: NSight Timeline of Thrust Exclusive Scan operation highlighting cudaMemcpyAsync.* + +![](images/results/thrust_nsight_timeline_streamsync.PNG) +*Figure 7: NSight Timeline of Thrust Exclusive Scan operation highlighting cudaStreamSynchronize.*
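+ +Notably, two of those three ```cudaMemcpyAsync``` calls exist only because this benchmark's data starts in host memory. If the data were already resident on the device, the same scan could be invoked on raw device pointers through thrust's execution-policy overload, skipping the ```device_vector``` construction copies entirely. A minimal sketch, where ```dev_in``` and ```dev_out``` are assumed to be pre-allocated device pointers holding ```n``` ints: + +``` +#include <thrust/execution_policy.h> +#include <thrust/scan.h> + +// scan data already resident on the GPU; no host <-> device copies needed +thrust::exclusive_scan(thrust::device, dev_in, dev_in + n, dev_out); +```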
+ +### Stream Compaction + +Likewise, Figure 8 below shows that the runtime of the stream compaction algorithm also increases linearly for each of the implementations, and again only the slope of this increase changes. However, unlike with scan, here the GPU stream compaction is significantly faster than either of the CPU implementations for arrays with very large numbers of elements (>2,000,000). This is because the CPU version (without scan), even though it is logically barely more complex than the CPU scan (which was faster than the GPU scans), has a conditional in the for loop. Because the elements of the stream compaction input array are unrelated to one another, in the worst case the array would have alternating elements that are ```0``` and not ```0```, causing CPU branch misprediction to incur large performance penalties with increasing array sizes. By parallelizing these conditionals, the GPU implementation of stream compaction has a much faster runtime for array sizes in the millions. + +![](images/figures/graph_compact.png) +*Figure 8: Effect of input array size on runtime of stream compaction algorithm.* + +### Bloopers + +No bloopers for this assignment! diff --git a/images/figures/graph_blocksize.png b/images/figures/graph_blocksize.png new file mode 100644 index 0000000..4d3aa86 Binary files /dev/null and b/images/figures/graph_blocksize.png differ diff --git a/images/figures/graph_compact.png b/images/figures/graph_compact.png new file mode 100644 index 0000000..e9d22c8 Binary files /dev/null and b/images/figures/graph_compact.png differ diff --git a/images/figures/graph_pow2.png b/images/figures/graph_pow2.png new file mode 100644 index 0000000..c11ad8f Binary files /dev/null and b/images/figures/graph_pow2.png differ diff --git a/images/figures/graph_scan.png b/images/figures/graph_scan.png new file mode 100644 index 0000000..83b0336 Binary files /dev/null and b/images/figures/graph_scan.png differ diff --git a/images/figures/scan_downsweep.PNG b/images/figures/scan_downsweep.PNG new file mode 100644 index 0000000..04ac8ed Binary files /dev/null and b/images/figures/scan_downsweep.PNG differ diff --git a/images/figures/scan_upsweep.PNG b/images/figures/scan_upsweep.PNG new file mode 100644 index 0000000..60ee4e5 Binary files /dev/null and b/images/figures/scan_upsweep.PNG differ diff --git a/images/results/compact_15.PNG b/images/results/compact_15.PNG new file mode 100644 index 0000000..ac48df9 Binary files /dev/null and b/images/results/compact_15.PNG differ diff --git a/images/results/compact_20.PNG b/images/results/compact_20.PNG new file mode 100644 index 0000000..3fde06f Binary files /dev/null and b/images/results/compact_20.PNG differ diff --git a/images/results/compact_25.PNG b/images/results/compact_25.PNG new file mode 100644 index 0000000..4d7feb8 Binary files /dev/null and b/images/results/compact_25.PNG differ diff --git a/images/results/scan_15.PNG b/images/results/scan_15.PNG new file mode 100644 index 0000000..b51ec32 Binary files /dev/null and b/images/results/scan_15.PNG differ diff --git a/images/results/scan_20.PNG b/images/results/scan_20.PNG new file mode 100644 index 0000000..cc29ce8 Binary
files /dev/null and b/images/results/scan_20.PNG differ diff --git a/images/results/scan_25.PNG b/images/results/scan_25.PNG new file mode 100644 index 0000000..3743639 Binary files /dev/null and b/images/results/scan_25.PNG differ diff --git a/images/results/thrust_nsight_timeline_memcpy.PNG b/images/results/thrust_nsight_timeline_memcpy.PNG new file mode 100644 index 0000000..a3ed5bd Binary files /dev/null and b/images/results/thrust_nsight_timeline_memcpy.PNG differ diff --git a/images/results/thrust_nsight_timeline_streamsync.PNG b/images/results/thrust_nsight_timeline_streamsync.PNG new file mode 100644 index 0000000..0e9b0d3 Binary files /dev/null and b/images/results/thrust_nsight_timeline_streamsync.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..531eacf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,11 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +int* a = new int[SIZE]; +int* b = new int[SIZE]; +int* c = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +64,35 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,16 +137,48 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + // RADIX SORT ON GPU NOT IMPLEMENTED + /* + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 100); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu std::stable_sort, power-of-two"); + StreamCompaction::CPU::stdSort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, power-of-two"); + StreamCompaction::CPU::radixSort(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("gpu radix sort, power-of-two"); + StreamCompaction::Efficient::radixSort(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c);*/ + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..021ba09 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -61,7 +61,7 @@ void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; + i = n - 3; printf("... "); } printf("%3d ", a[i]); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..232b38d 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,18 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (idata[thread_num] == 0) { + bools[thread_num] = 0; + } + else { + bools[thread_num] = 1; + } } /** @@ -32,8 +43,44 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (bools[thread_num] == 1) { + odata[indices[thread_num]] = idata[thread_num]; + } } + + /** + * Maps an array to an array of 0s and 1s for radix sort. 
Elements + * whose value in bit b is 0 are mapped to 1, otherwise 0 (the inverted bits are what + * the radix sort scan consumes) + */ + __global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (idata[thread_num] & c) { + bools[thread_num] = 0; + } + else { + bools[thread_num] = 1; + } + } + + + __global__ void kernReverseArray(int n, int* odata_reversed, const int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + odata_reversed[thread_num] = odata[n - thread_num - 1]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..c2b9c77 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,10 +9,16 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 + +//#define inclusiveToExclusive + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -37,6 +43,10 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata); + + __global__ void kernReverseArray(int n, int* odata_reversed, const int* odata); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..c334ae4 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int num_elements = 0; + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[num_elements] = idata[i]; + ++num_elements; + } + } timer().endCpuTimer(); - return -1; + return num_elements; } /** @@ -41,10 +50,139 @@ * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* temp_array = new int[n]; + timer().startCpuTimer(); - // TODO + + int num_elements = 0; + // GENERATE CONDITION VALUES + for (int i = 0; i < n; ++i) { + if (idata[i] == 0) { + odata[i] = 0; + } + else { + odata[i] = 1; + } + } + + // SCAN + temp_array[0] = 0; + for (int i = 1; i < n; ++i) { + temp_array[i] = temp_array[i - 1] + odata[i - 1]; + } + + // SCATTER + for (int i = 0; i < n; ++i) { + if (odata[i] == 1) { + odata[temp_array[i]] = idata[i]; + ++num_elements; + } + } + timer().endCpuTimer(); + + delete[] temp_array; + + return num_elements; + } + + /** + * CPU radix sort implementation + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to sort.
+ */ + void radixSort(int n, int* odata, const int* idata) { + // not the greatest cpu implementation of radix sort (quite memory intensive), but functional + + int* temp_array_0 = new int[n * 2]; + int* temp_array_1 = new int[n * 2]; + + for (int i = 0; i < n; ++i) { + temp_array_0[i] = idata[i]; + } + + int index_left_partition = 0; + int index_right_partition = 0; + int num_left_partition = n; + int num_right_partition = 0; + + timer().startCpuTimer(); + + for (int b = 0; b < sizeof(int) * 8; ++b) { + // handle left partition + for (int i = 0; i < num_left_partition; ++i) { + if (!(temp_array_0[i] & (1 << b))) { + temp_array_1[index_left_partition] = temp_array_0[i]; + index_left_partition++; + } + else { + temp_array_1[index_right_partition + n] = temp_array_0[i]; + index_right_partition++; + } + } + + //handle right partition + for (int i = n; i < n + num_right_partition; ++i) { + if (!(temp_array_0[i] & (1 << b))) { + temp_array_1[index_left_partition] = temp_array_0[i]; + index_left_partition++; + } + else { + temp_array_1[index_right_partition + n] = temp_array_0[i]; + index_right_partition++; + } + } + + int* temp = temp_array_0; + temp_array_0 = temp_array_1; + temp_array_1 = temp; + + num_left_partition = index_left_partition; + num_right_partition = index_right_partition; + index_left_partition = 0; + index_right_partition = 0; + } + + + + timer().endCpuTimer(); + + // handle left partition + int odata_index = 0; + for (int i = 0; i < num_left_partition; ++i) { + odata[odata_index] = temp_array_0[i]; + odata_index++; + } + + //handle right partition + for (int i = n; i < n + num_right_partition; ++i) { + odata[odata_index] = temp_array_0[i]; + odata_index++; + } + + delete[] temp_array_0; + delete[] temp_array_1; + } + + /** + * CPU radix sort implementation using std::stable_sort + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to sort. 
*/ + void stdSort(int n, int* odata, const int* idata) { + std::vector<int> vect_idata(idata, idata + n); + + timer().startCpuTimer(); + std::stable_sort(vect_idata.begin(), vect_idata.end()); + timer().endCpuTimer(); + + for (int i = 0; i < n; ++i) { + odata[i] = vect_idata[i]; + } } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..f1eb1a4 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,9 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); + + void stdSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..7b822ef 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,15 +12,202 @@ namespace StreamCompaction { return timer; } + __global__ void kern_ComputeEfficientExclusiveScanUpSweepIteration(int N, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num % offset_d_plus_1 == 0) { + odata[thread_num + offset_d_plus_1 - 1] += odata[thread_num + offset_d - 1]; + } + + } + + __global__ void kern_ComputeEfficientExclusiveScanDownSweepIteration(int N, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num % offset_d_plus_1 == 0) { + int t = odata[thread_num + offset_d - 1]; + odata[thread_num + offset_d - 1] = odata[thread_num + offset_d_plus_1 - 1]; + odata[thread_num + offset_d_plus_1 - 1] += t; + } + + } + + __global__ void kern_ComputeEfficientExclusiveScanUpSweepIteration_optimized(int N, int N_over_2, int num_active_threads, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= num_active_threads) { + return; + } + + int reversed_thread_num = (N - thread_num) - 1; + int reversed_offset = (N - offset_d) - 1; + + if (reversed_thread_num > reversed_offset) { + odata[reversed_thread_num] += odata[reversed_thread_num - offset_d]; + } + } + + __global__ void kern_ComputeEfficientExclusiveScanDownSweepIteration_optimized(int N, int num_active_threads, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + int reversed_thread_num = (N - thread_num) - 1; + int reversed_offset = (N - offset_d) - 1; + + if (reversed_thread_num > reversed_offset) { + int t = odata[reversed_thread_num - offset_d]; + odata[reversed_thread_num - offset_d] = odata[reversed_thread_num]; + odata[reversed_thread_num] += t; + } + + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { + int* dev_odata; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU (the host array only holds old_n valid elements) + cudaMemcpy(dev_odata, idata, sizeof(int) * old_n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_odata + (old_n), 0, sizeof(int) * (n - old_n)); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_odata + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata); + cudaDeviceSynchronize(); } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + * + * This is a version of Efficient::Scan that is optimized as part of EXTRA CREDIT in + * Part 5: Why is My GPU Approach So Slow?
+ */ + void scan_optimized(int n, int* odata, const int* idata) { + /*int* dev_odata; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_odata + (old_n), 0, sizeof(int) * (n - old_n)); + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + int num_active_threads = n; + + // UPSWEEP + for (int d = num_iterations; d > 0; --d) { + // compute upsweep iteration d + num_active_threads /= 2; + numBlocks = (num_active_threads + blockSize - 1) / blockSize; + //std::cout << num_active_threads << std::endl; + int reversed_thread_num = (n - 0) - 1; + int reversed_offset = (n - num_active_threads) - 1; + //std::cout << "fe: " << reversed_thread_num << " " << num_active_threads << " used_index: " << reversed_thread_num - num_active_threads << std::endl; + kern_ComputeEfficientExclusiveScanUpSweepIteration_optimized << < numBlocks, blockSize >> > (n, n / 2, num_active_threads, num_active_threads, pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_odata + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + numBlocks = (n + blockSize - 1) / blockSize; + + num_active_threads = 1; + + // DOWNSWEEP + for (int d = 0; d < num_iterations; ++d) { + // compute downsweep iteration d + + numBlocks = (num_active_threads + blockSize - 1) / blockSize; + kern_ComputeEfficientExclusiveScanDownSweepIteration_optimized <<< numBlocks, blockSize >>> (n, num_active_threads, num_active_threads, pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + num_active_threads *= 2; + } + + timer().endGpuTimer(); + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata); + cudaDeviceSynchronize();*/ + } + + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + * + * This is a version of Efficient::Scan that is optimized using shared memory as part of EXTRA CREDIT in + * Part 6: GPU Scan Using Shared Memory && Hardware Optimization + */ + void scan_sharedMem(int n, int* odata, const int* idata) { + + } + + + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -30,11 +217,199 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + int* dev_idata; + int* dev_odata; + int* dev_boolean_map; + int* dev_indices; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_boolean_map, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU (the host array only holds old_n valid elements) + cudaMemcpy(dev_idata, idata, sizeof(int) * old_n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_idata + (old_n), 0, sizeof(int) * (n - old_n)); + + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + //////////////////////////////////////////// + // MAP TO BOOLEAN + + StreamCompaction::Common::kernMapToBoolean << < numBlocks, blockSize >> > (n, dev_boolean_map, dev_idata); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + + // copy data from boolean map to index array for scan + cudaMemcpy(dev_indices, dev_boolean_map, sizeof(int) * n, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); + // pad out rest of the expanded length with 0s + cudaMemset(dev_indices + (old_n), 0, sizeof(int) * (n - old_n)); + + //////////////////////////////////////////// + // SCAN + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_indices + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + //////////////////////////////////////////// + + //////////////////////////////////////////// + // SCATTER + + StreamCompaction::Common::kernScatter << < numBlocks, blockSize >> > (n, dev_odata, dev_idata, dev_boolean_map, dev_indices); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + + + + timer().endGpuTimer(); + + // copy over last index value from gpu index array to local int to return + int last_index_val = 0; + int last_bool_val = 0; + cudaMemcpy(&last_index_val, dev_indices + old_n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_bool_val, dev_boolean_map + old_n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + int num_elements = last_index_val; + if (last_bool_val == 1) { + ++num_elements; + } + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_boolean_map); + cudaFree(dev_indices); + cudaDeviceSynchronize(); + return num_elements; + } + + // + // RADIX SORT NOT FULLY IMPLEMENTED + // + + /** + * Performs radix sort on idata, storing the result into odata. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements.
+ * @param idata The array of elements to sort. + */ + void radixSort(int n, int* odata, const int* idata) { + int* dev_i; + int* dev_b; + int* dev_e; + int* dev_f; + int* dev_t; + int* dev_d; + + // need to pad out original array to nearest power of two size + //int old_n = n; + //n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_i, n * sizeof(int)); + cudaMalloc((void**)&dev_b, n * sizeof(int)); + cudaMalloc((void**)&dev_e, n * sizeof(int)); + cudaMalloc((void**)&dev_f, n * sizeof(int)); + cudaMalloc((void**)&dev_t, n * sizeof(int)); + cudaMalloc((void**)&dev_d, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU + cudaMemcpy(dev_i, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + for (int b = 0; b < sizeof(int) * 8; ++b) { + //////////////////////////////////////////// + // MAP TO BOOLEAN + StreamCompaction::Common::kernMapToBooleanBitwiseCheck << < numBlocks, blockSize >> > (n, 1 << b, dev_e, dev_i); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + /* + // copy data from boolean map to index array for scan + cudaMemcpy(dev_f, dev_e, sizeof(int) * n, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); + // pad out rest of the expanded length with 0s + cudaMemset(dev_f + (old_n), 0, sizeof(int) * (n - old_n)); + + //////////////////////////////////////////// + // SCAN + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_indices + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + //////////////////////////////////////////// + */ + } + timer().endGpuTimer(); - return -1; + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_e, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_i); + cudaFree(dev_b); + cudaFree(dev_e); + cudaFree(dev_f); + cudaFree(dev_t); + cudaFree(dev_d); + cudaDeviceSynchronize(); } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..a5fed4c 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scan_optimized(int n, int* odata, const int* idata); + + void scan_sharedMem(int n, int* odata, const int* idata); + int compact(int n, int *odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..e4a7b20 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,109 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kern_ComputeExclusiveScanIteration(int N, int offset, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + // warp 0 will 
have divergent branches on offset == 1, but that would happen anyway if this were moved to a separate kernel + if (offset == 1 && thread_num == 0) { + // SPECIAL CASE: first iteration and 0th element should be zeroed-out for exclusive scan + odata[thread_num] = 0; + } + else if (offset == 1 && thread_num == 1) { + // SPECIAL CASE: first iteration and 1st element should be equal to the original 0th element for exclusive scan + odata[thread_num] = idata[thread_num - 1]; + } + else if (offset == 1 && thread_num > 1) { + // SPECIAL CASE: first iteration elements past index 1 should use two previous values for summing + odata[thread_num] = idata[thread_num - 2] + idata[thread_num - 1]; + } + else if (thread_num < offset) { + // same as for inclusive + odata[thread_num] = idata[thread_num]; + } + else { + // same as for inclusive + odata[thread_num] = idata[thread_num - offset] + idata[thread_num]; + } + } + + __global__ void kern_ComputeInclusiveScanIteration(int N, int offset, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + + if (thread_num < offset) { + odata[thread_num] = idata[thread_num]; + } + else { + odata[thread_num] = idata[thread_num - offset] + idata[thread_num]; + } + } + + __global__ void kern_InclusiveToExclusiveScan(int N, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num == 0) { + odata[thread_num] = 0; + } + else { + odata[thread_num] = idata[thread_num - 1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_odata_1; + int* dev_odata_2; + + cudaMalloc((void**)&dev_odata_1, n * sizeof(int)); + cudaMalloc((void**)&dev_odata_2, n * sizeof(int)); + cudaDeviceSynchronize(); + + cudaMemcpy(dev_odata_1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + for (int d = 1; d <= num_iterations; ++d) { + // compute scan iteration d +#ifdef inclusiveToExclusive + kern_ComputeInclusiveScanIteration <<< numBlocks, blockSize >>> (n, pow(2, d - 1), dev_odata_2, dev_odata_1); +#else + kern_ComputeExclusiveScanIteration <<< numBlocks, blockSize >>> (n, pow(2, d - 1), dev_odata_2, dev_odata_1); +#endif + cudaDeviceSynchronize(); + int* temp = dev_odata_1; + dev_odata_1 = dev_odata_2; + dev_odata_2 = temp; + } + +#ifdef inclusiveToExclusive + kern_InclusiveToExclusiveScan <<< numBlocks, blockSize >>> (n, dev_odata_2, dev_odata_1); + cudaDeviceSynchronize(); +#endif timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata_1, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata_1); + cudaFree(dev_odata_2); + cudaDeviceSynchronize(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..2f5d8a5 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/ void scan(int n, int *odata, const int *idata) { + thrust::device_vector<int> thrust_dv_idata(idata, idata + n); + thrust::device_vector<int> thrust_dv_odata(odata, odata + n); + + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(thrust_dv_idata.begin(), thrust_dv_idata.end(), thrust_dv_odata.begin()); + timer().endGpuTimer(); + + cudaMemcpy(odata, (thrust_dv_odata.data()).get(), sizeof(int) * n, cudaMemcpyDeviceToHost); } } }