diff --git a/README.md b/README.md index 0e38ddb..89518f8 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,392 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Nick Moon + * [LinkedIn](https://www.linkedin.com/in/nick-moon1/), [personal website](https://nicholasmoon.github.io/) +* Tested on: Windows 10, AMD Ryzen 9 5900HS @ 3.0GHz 32GB, NVIDIA RTX 3060 Laptop 6GB (Personal Laptop) -### (TODO: Your README) +**This is a Stream Compaction algorithm implementation in C++ using CUDA for GPU acceleration. This allows for +compacting arrays with millions of elements in parallel on the GPU.** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Results + +*Testbench functional and performance results for scan and stream compaction on input array size of 2^25 (33554432):* +``` +number of elements in input array for power of 2 tests = 33554432 +number of elements in input array for non-power of 2 tests = 33554429 + +**************** +** SCAN TESTS ** +**************** + [ 34 41 19 15 6 8 5 37 47 15 4 40 20 ... 44 16 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 19.1805ms (std::chrono Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 19.9557ms (std::chrono Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 29.1328ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 29.0673ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 24.446ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 21.2756ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.16003ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815873 821815917 821815933 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.17952ms (CUDA Measured) + [ 0 34 75 94 109 115 123 128 165 212 227 231 271 ... 821815825 821815842 821815853 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 3 0 0 1 1 0 3 1 0 1 1 ... 2 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 51.9551ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 2 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 51.9623ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 3 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 103.018ms (std::chrono Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 2 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 25.3891ms (CUDA Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 
2 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 25.2508ms (CUDA Measured) + [ 2 3 3 1 1 3 1 1 1 3 2 2 2 ... 3 2 3 ] + passed +``` + + +## Implementation + +### Array Scan + +The array scan algorithm, given an input array of ints ```idata``` of size ```n```, outputs an array ```odata```, also of size ```n```, where the value at each index ```i``` in ```odata``` is the sum of elements ```0``` through ```i - 1``` of ```idata```. Exclusive scan is the version implemented in this repository, as opposed to inclusive scan: ```odata[0] = 0```, and each element of ```odata``` excludes its corresponding value in ```idata``` from the sum. Each of the different implementations of the exclusive scan algorithm is discussed in more detail below. + +#### CPU Implementation + +The CPU implementation implements the array scan algorithm in a serial format, according to the pseudocode: + +``` +function scan_cpu(input_array, output_array, number_of_elements): + + output_array[0] = 0 + for i in range [1, number_of_elements): + output_array[i] = output_array[i - 1] + input_array[i - 1] + +end +``` + +#### Naive CUDA Implementation + +The naive CUDA implementation maps the original CPU scan approach onto parallel GPU hardware. This necessitates the use of double-buffering in order to avoid race conditions: the two GPU arrays used as input and output are swapped each iteration. + +``` +function scan_gpu(input_array, output_array, number_of_elements): + + let o_array_gpu_0 = input_array (this is an array on the GPU) + let o_array_gpu_1 = o_array_gpu_0 (this is an array on the GPU) + + for all k in parallel: + shift_array_right(k, o_array_gpu_1, o_array_gpu_0) + swap(o_array_gpu_0, o_array_gpu_1) + + for d in range [1, ceil( log_2(number_of_elements) ) ]: + for all k in parallel: + naive_scan_iteration(k, o_array_gpu_0, o_array_gpu_1, 2^d) + swap(o_array_gpu_0, o_array_gpu_1) + + output_array = o_array_gpu_0 + +end + +kernel shift_array_right(thread_ID, input_array, output_array): + if thread_ID == 0 then: + output_array[0] = 0 + else: + output_array[thread_ID] = input_array[thread_ID - 1] + +end + +kernel naive_scan_iteration(thread_ID, input_array, output_array, offset): + if (thread_ID < offset) then: + output_array[thread_ID] = input_array[thread_ID] + else: + output_array[thread_ID] = input_array[thread_ID - offset] + input_array[thread_ID] + +end +``` + +#### Work-Efficient CUDA Implementation + +To make the parallel approach more efficient, a different scheme is used. By treating the array as a balanced binary tree, the new approach separates the algorithm into two steps: the up-sweep and the down-sweep. The up-sweep is a parallel reduction, which stores the sum of the array in the last index. A picture of the up-sweep on an example array is shown in Figure 1 below: + +![](images/figures/scan_upsweep.PNG) +*Figure 1: Demonstration of efficient scan up-sweep operation.* + +After the up-sweep, the element at the last index (the sum of the array) is zeroed out, and then the down-sweep occurs, as seen in Figure 2 below: + +![](images/figures/scan_downsweep.PNG) +*Figure 2: Demonstration of efficient scan down-sweep operation.*
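+ +In pseudocode, the two sweeps look roughly as follows. This is a sketch of the standard work-efficient (Blelloch) scan matching Figures 1 and 2; it assumes ```n``` is a power of two, which the implementation guarantees by padding the array: + +``` +function scan_gpu_efficient(array, n): + + // UP-SWEEP: parallel reduction; afterwards array[n - 1] holds the total sum + for d in range [0, log_2(n) ): + for all k in [0, n) with k mod 2^(d+1) == 0, in parallel: + array[k + 2^(d+1) - 1] += array[k + 2^d - 1] + + // zero out the root before sweeping back down + array[n - 1] = 0 + + // DOWN-SWEEP: push partial sums back down the tree to form the exclusive scan + for d in range [ log_2(n) - 1, 0 ]: + for all k in [0, n) with k mod 2^(d+1) == 0, in parallel: + t = array[k + 2^d - 1] + array[k + 2^d - 1] = array[k + 2^(d+1) - 1] + array[k + 2^(d+1) - 1] += t + +end +```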
+ +#### Thrust Implementation + +For the thrust implementation, the input and output arrays are simply converted to thrust library ```device_vector```s, and the thrust ```exclusive_scan()``` function is called with them. The thrust device vector for the output data is then transferred back to the host array. + +### Stream Compaction + +Stream compaction is an array algorithm that, given an input array of ints ```idata``` of size ```n```, returns an output array ```odata``` of some size in ```[0, n]``` such that ```odata``` contains only the values ```x``` in ```idata``` that satisfy some criteria function ```f(x)```. This is essentially used to compact an array into a smaller size by removing unneeded elements, as determined by the criteria function ```f(x)```. Values for which ```f(x)``` returns ```true``` are kept, while values for which ```f(x)``` returns ```false``` are removed. + +#### CPU Implementation (without scan) + +The CPU implementation (without scan) implements the stream compaction algorithm in a serial format, according to the pseudocode: + +``` +function stream_compact_cpu_no_scan(input_array, output_array, number_of_elements): + + let num_compacted_elements = 0 + for i in range [0, number_of_elements): + if input_array[i] != 0 then: + output_array[num_compacted_elements] = input_array[i] + num_compacted_elements = num_compacted_elements + 1 + + return num_compacted_elements + +end +``` + +This method keeps a rolling index into the output array while looping through the input array, writing to (and advancing) that index only when a value in the input array satisfies the condition function, which in this case is ```x != 0```. + +#### CPU Implementation (with scan) + +The CPU implementation (with scan) implements the stream compaction algorithm in a "pseudo"-parallel format, meaning that the structure of the algorithm mimics what will be used for the CUDA implementation, but still operates on the array serially. This involves a (serial) pass that checks whether each element in the input array is non-zero, a (serial) scan of this boolean data, and then a (serial) scatter over the scan result to build the final output array. The pseudocode is below, with a short worked example after it: + +``` +function stream_compact_cpu_scan(input_array, output_array, number_of_elements): + + let num_compacted_elements = 0 + + // GENERATE CONDITION VALUES + + for i in range [0, number_of_elements): + if input_array[i] == 0 then: + output_array[i] = 0 + else: + output_array[i] = 1 + + // SCAN + + let temp_array[number_of_elements] + temp_array[0] = 0 + + for i in range [1, number_of_elements): + temp_array[i] = temp_array[i - 1] + output_array[i - 1] + + // SCATTER + + for i in range [0, number_of_elements): + if output_array[i] == 1 then: + output_array[temp_array[i]] = input_array[i] + num_compacted_elements = num_compacted_elements + 1 + + return num_compacted_elements + +end +```
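+ +For example, on a small made-up input, the three steps transform the data as follows; note how the exclusive scan of the booleans hands each kept element its index in the compacted output: + +``` +input: [ 3 0 4 0 0 5 ] +booleans: [ 1 0 1 0 0 1 ] +scan: [ 0 1 1 2 2 2 ] (exclusive scan of the booleans) +scatter: output[0] = 3, output[1] = 4, output[2] = 5 +result: [ 3 4 5 ], num_compacted_elements = 2 + 1 = 3 +```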
+ +#### Efficient Parallel Implementation + +The GPU version of this algorithm directly models the CPU-with-scan version, except this time the 3 steps (the condition check, the scan, and the scatter) are all performed on the GPU. The scan function is the same as the efficient parallel scan from the previous section, while the pseudocode for the other two functions is shown below: + +``` + +kernel map_to_boolean(thread_ID, input_array, output_array): + + if input_array[thread_ID] == 0 then: + output_array[thread_ID] = 0 + else: + output_array[thread_ID] = 1 + +end + +kernel scatter(thread_ID, input_array, output_array, boolean_array, indices_array): + + if boolean_array[thread_ID] == 1 then: + output_array[indices_array[thread_ID]] = input_array[thread_ID] + +end +```
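+ +Stitched together on the host, the whole pipeline looks roughly like the following sketch (simplified from the actual ```compact()``` in ```efficient.cu```; the power-of-two padding, timing, and error checking are omitted, and ```blockSize``` is assumed to be defined as in ```common.h```): + +``` +int numBlocks = (n + blockSize - 1) / blockSize; // one thread per element + +// 1) condition check: dev_boolean_map[i] = (dev_idata[i] != 0) ? 1 : 0 +StreamCompaction::Common::kernMapToBoolean<<<numBlocks, blockSize>>>(n, dev_boolean_map, dev_idata); + +// 2) exclusive scan of the booleans gives each kept element its output index +cudaMemcpy(dev_indices, dev_boolean_map, sizeof(int) * n, cudaMemcpyDeviceToDevice); +// ... the up-sweep and down-sweep kernels from the previous section run on dev_indices ... + +// 3) scatter each kept element to its compacted position +StreamCompaction::Common::kernScatter<<<numBlocks, blockSize>>>(n, dev_odata, dev_idata, dev_boolean_map, dev_indices); +```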
+ +### Extra Credit + +I implemented the CPU radix sort but did not end up finishing my GPU version. + +## Testing Strategy + +**All tests and results are done in Release mode in Visual Studio** + +The first step in generating results was to figure out the optimal block size for the different GPU implementations of the scan and stream compaction algorithms. Data from each implementation was collected with a constant input array size of 2^25 (33,554,432) for power-of-two block sizes from 32 to 1024, and the results are shown in Figure 3 below. + +![](images/figures/graph_blocksize.png) +*Figure 3: Effect of CUDA block size on runtime of scan and stream compaction.* + +As seen from the graph, for both exclusive scan and stream compaction, the optimal block size is around 128-256. Below these values, the runtime rises sharply with decreasing block size, getting as high as almost 4x worse performance at a block size of 32 compared to 256 for the parallel stream compaction on an array whose size is not a power of 2. Similarly, the runtime also grows with increasing block size past 256, at least for all the algorithms save the naive scan. This growth appears roughly linear for those algorithms, while the runtime stays roughly constant for the naive scan. + +To be more specific, the block size with the lowest runtime, and thus the **optimal block size**, for the **naive parallel scan is 128**, and for **all others is 256**. These values will thus be used to test and retrieve results for the forthcoming topics of discussion. + +In the **Performance Analysis** section below, all data is from the ***Not Power of 2 Array Size*** tests. This is the most general test case for analyzing these algorithms, as most often an array will not fit neatly into the CUDA blocks. Additionally, there is a negligible difference in the runtimes between the power-of-2 and non-power-of-2 tests. This can be seen in Figure 4 below: + +![](images/figures/graph_pow2.png) +*Figure 4: Runtime comparison between tests on arrays with power-of-2 sizes and not.* + +## Performance Analysis + +**All tests and results are done in Release mode in Visual Studio** + +### Scan + +As can be seen in Figure 5 below, the runtime of the scan algorithm increases roughly linearly for each of the different implementations, but the slope of this increase is different for each one. + +The thrust version was by far the fastest in most cases and had the smallest slope. In all test cases, the thrust version maintained a runtime below 3ms. This makes sense, as thrust is a highly optimized GPU and parallel algorithm library developed for high-performance applications. Its performance relative to the CUDA implementations found in this repository is discussed further down the readme. + +The next best version was the CPU scan. While this is initially surprising, there are a couple of benefits the CPU version has with respect to the GPU versions. First, the logic is very simple for computing each element of the output, and the serial nature of the algorithm means it is very cache coherent. It also requires no additional memory allocation (CPU or GPU), which all the parallel versions require. + +The work-efficient parallel scan is the next best version. This is slower than the CPU and thrust versions, but faster than the naive parallel version in the long run. The reason this is slower than the CPU version is, first of all, that it uses only global memory. This is the GPU memory with the highest latency, and, especially when compared with the CPU version's cache coherency, is a huge bottleneck on the algorithm. Another problem is that the number of threads launched remains constant each iteration of the algorithm. This means that, during both the up-sweep and down-sweep, there are potentially millions of threads allocated for a pass (i.e. one of the last data points on the graph), while only one or a few are actually active at a time. A more optimized version of the algorithm would launch only the threads that are needed. + +Finally, the naive parallel scan is the worst version. For the 33-million-element array, this algorithm is 50% slower than the efficient parallel scan, and 30x slower than the thrust library implementation. This is mostly due to a couple of factors. First, the naive parallel algorithm never decreases the number of threads (number of blocks) launched for its kernels, which means at the final iteration there are millions of threads being launched (for the final data point), and only half of them are actually doing work. Relatedly, the number of idle threads merely doubles each iteration: at iteration ```d```, only the ```2^(d-1)``` threads below the offset degrade to plain copies, so compared with the efficient version, which halves the number of active threads every iteration, the naive approach keeps nearly all of its threads summing until the last few iterations. A final problem with this approach is that it requires double-buffering, meaning at least double the memory of the efficient version, along with the extra memory traffic that comes with that much additional memory usage. + +![](images/figures/graph_scan.png) +*Figure 5: Effect of input array size on runtime of scan algorithm.* + +##### Thrust NSight Analysis + +Taking a deeper look into the Thrust version of exclusive scan, the NSight profiling timeline seen in Figures 6 and 7 reveals that it consists of at least 1 ```cudaEventCreate``` API call, as well as 3 ```cudaMemcpyAsync``` calls. Two of these ```cudaMemcpyAsync``` calls are performed as part of converting the input and output arrays from C-style arrays to Thrust ```device_vector```s, and the third comes from retrieving the output data from the output ```device_vector``` and placing it into the output C-style array. Between the first two ```cudaMemcpyAsync``` calls and the third are ```cudaMalloc```, ```cudaStreamSynchronize```, and ```cudaFree``` function calls. This is where the actual implementation of the Thrust exclusive scan function does its work: it allocates device memory, operates on the data, and then frees the memory at the end. + +![](images/results/thrust_nsight_timeline_memcpy.PNG) +*Figure 6: NSight Timeline of Thrust Exclusive Scan operation highlighting cudaMemcpyAsync.* + +![](images/results/thrust_nsight_timeline_streamsync.PNG) +*Figure 7: NSight Timeline of Thrust Exclusive Scan operation highlighting cudaStreamSynchronize.*
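+ +Notably, two of those three ```cudaMemcpyAsync``` calls exist only because this benchmark's data starts in host memory. If the data were already resident on the device, the same scan could be invoked on raw device pointers through thrust's execution-policy overload, skipping the ```device_vector``` construction copies entirely. A minimal sketch, where ```dev_in``` and ```dev_out``` are assumed to be pre-allocated device pointers holding ```n``` ints: + +``` +#include <thrust/execution_policy.h> +#include <thrust/scan.h> + +// scan data already resident on the GPU; no host <-> device copies needed +thrust::exclusive_scan(thrust::device, dev_in, dev_in + n, dev_out); +```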
+ +### Stream Compaction + +Likewise, Figure 8 below shows that the runtime of the stream compaction algorithm also increases linearly for each of the implementations, and again only the slope of this increase changes. However, unlike with scan, here the GPU stream compaction is significantly faster than either of the CPU implementations for arrays with very large numbers of elements (>2,000,000). This is because the CPU version (without scan), even though it is logically barely more complex than the CPU scan (which was faster than the GPU scans), has a conditional in the for loop. Because the elements of the stream compaction input array are unrelated to one another, in the worst case the array would have alternating elements that are ```0``` and not ```0```, causing CPU branch misprediction to incur large performance penalties with increasing array sizes. By parallelizing these conditionals, the GPU implementation of stream compaction has a much faster runtime for array sizes in the millions. + +![](images/figures/graph_compact.png) +*Figure 8: Effect of input array size on runtime of stream compaction algorithm.* + +### Bloopers + +No bloopers for this assignment! diff --git a/images/figures/graph_blocksize.png b/images/figures/graph_blocksize.png new file mode 100644 index 0000000..4d3aa86 Binary files /dev/null and b/images/figures/graph_blocksize.png differ diff --git a/images/figures/graph_compact.png b/images/figures/graph_compact.png new file mode 100644 index 0000000..e9d22c8 Binary files /dev/null and b/images/figures/graph_compact.png differ diff --git a/images/figures/graph_pow2.png b/images/figures/graph_pow2.png new file mode 100644 index 0000000..c11ad8f Binary files /dev/null and b/images/figures/graph_pow2.png differ diff --git a/images/figures/graph_scan.png b/images/figures/graph_scan.png new file mode 100644 index 0000000..83b0336 Binary files /dev/null and b/images/figures/graph_scan.png differ diff --git a/images/figures/scan_downsweep.PNG b/images/figures/scan_downsweep.PNG new file mode 100644 index 0000000..04ac8ed Binary files /dev/null and b/images/figures/scan_downsweep.PNG differ diff --git a/images/figures/scan_upsweep.PNG b/images/figures/scan_upsweep.PNG new file mode 100644 index 0000000..60ee4e5 Binary files /dev/null and b/images/figures/scan_upsweep.PNG differ diff --git a/images/results/compact_15.PNG b/images/results/compact_15.PNG new file mode 100644 index 0000000..ac48df9 Binary files /dev/null and b/images/results/compact_15.PNG differ diff --git a/images/results/compact_20.PNG b/images/results/compact_20.PNG new file mode 100644 index 0000000..3fde06f Binary files /dev/null and b/images/results/compact_20.PNG differ diff --git a/images/results/compact_25.PNG b/images/results/compact_25.PNG new file mode 100644 index 0000000..4d7feb8 Binary files /dev/null and b/images/results/compact_25.PNG differ diff --git a/images/results/scan_15.PNG b/images/results/scan_15.PNG new file mode 100644 index 0000000..b51ec32 Binary files /dev/null and b/images/results/scan_15.PNG differ diff --git a/images/results/scan_20.PNG b/images/results/scan_20.PNG new file mode 100644 index 0000000..cc29ce8 Binary
files /dev/null and b/images/results/scan_20.PNG differ diff --git a/images/results/scan_25.PNG b/images/results/scan_25.PNG new file mode 100644 index 0000000..3743639 Binary files /dev/null and b/images/results/scan_25.PNG differ diff --git a/images/results/thrust_nsight_timeline_memcpy.PNG b/images/results/thrust_nsight_timeline_memcpy.PNG new file mode 100644 index 0000000..a3ed5bd Binary files /dev/null and b/images/results/thrust_nsight_timeline_memcpy.PNG differ diff --git a/images/results/thrust_nsight_timeline_streamsync.PNG b/images/results/thrust_nsight_timeline_streamsync.PNG new file mode 100644 index 0000000..0e9b0d3 Binary files /dev/null and b/images/results/thrust_nsight_timeline_streamsync.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..531eacf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,11 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +int* a = new int[SIZE]; +int* b = new int[SIZE]; +int* c = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -64,35 +64,35 @@ int main(int argc, char* argv[]) { printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,16 +137,48 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + // RADIX SORT ON GPU NOT IMPLEMENTED + /* + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 100); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + printArray(SIZE, a, true); + + zeroArray(SIZE, b); + printDesc("cpu std::stable_sort, power-of-two"); + StreamCompaction::CPU::stdSort(SIZE, b, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(SIZE, b, true); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, power-of-two"); + StreamCompaction::CPU::radixSort(SIZE, c, a); + printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("gpu radix sort, power-of-two"); + StreamCompaction::Efficient::radixSort(SIZE, c, a); + printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(SIZE, b, c);*/ + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 025e94a..021ba09 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -61,7 +61,7 @@ void printArray(int n, int *a, bool abridged = false) { printf(" [ "); for (int i = 0; i < n; i++) { if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; + i = n - 3; printf("... "); } printf("%3d ", a[i]); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..232b38d 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,18 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (idata[thread_num] == 0) { + bools[thread_num] = 0; + } + else { + bools[thread_num] = 1; + } } /** @@ -32,8 +43,44 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (bools[thread_num] == 1) { + odata[indices[thread_num]] = idata[thread_num]; + } } + + /** + * Maps an array to an array of 0s and 1s for radix sort. 
Elements + * whose value in bit b is 0 are mapped to 1, otherwise 0 (the inverted bits are what + * the radix sort scan consumes) + */ + __global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + if (idata[thread_num] & c) { + bools[thread_num] = 0; + } + else { + bools[thread_num] = 1; + } + } + + + __global__ void kernReverseArray(int n, int* odata_reversed, const int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= n) { + return; + } + + odata_reversed[thread_num] = odata[n - thread_num - 1]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..c2b9c77 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -9,10 +9,16 @@ #include #include #include +#include +#include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blockSize 128 + +//#define inclusiveToExclusive + /** * Check for CUDA errors; print and exit if there was a problem. */ @@ -37,6 +43,10 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void kernMapToBooleanBitwiseCheck(int n, int c, int* bools, const int* idata); + + __global__ void kernReverseArray(int n, int* odata_reversed, const int* odata); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..c334ae4 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,7 +19,10 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } timer().endCpuTimer(); } @@ -30,9 +33,15 @@ */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int num_elements = 0; + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[num_elements] = idata[i]; + ++num_elements; + } + } timer().endCpuTimer(); - return -1; + return num_elements; } /** @@ -41,10 +50,139 @@ * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + int* temp_array = new int[n]; + timer().startCpuTimer(); - // TODO + + int num_elements = 0; + // GENERATE CONDITION VALUES + for (int i = 0; i < n; ++i) { + if (idata[i] == 0) { + odata[i] = 0; + } + else { + odata[i] = 1; + } + } + + // SCAN + temp_array[0] = 0; + for (int i = 1; i < n; ++i) { + temp_array[i] = temp_array[i - 1] + odata[i - 1]; + } + + // SCATTER + for (int i = 0; i < n; ++i) { + if (odata[i] == 1) { + odata[temp_array[i]] = idata[i]; + ++num_elements; + } + } + timer().endCpuTimer(); + + delete[] temp_array; + + return num_elements; + } + + /** + * CPU radix sort implementation + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to sort.
+ */ + void radixSort(int n, int* odata, const int* idata) { + // not the greatest cpu implementation of radix sort (quite memory intensive), but functional + + int* temp_array_0 = new int[n * 2]; + int* temp_array_1 = new int[n * 2]; + + for (int i = 0; i < n; ++i) { + temp_array_0[i] = idata[i]; + } + + int index_left_partition = 0; + int index_right_partition = 0; + int num_left_partition = n; + int num_right_partition = 0; + + timer().startCpuTimer(); + + for (int b = 0; b < sizeof(int) * 8; ++b) { + // handle left partition + for (int i = 0; i < num_left_partition; ++i) { + if (!(temp_array_0[i] & (1 << b))) { + temp_array_1[index_left_partition] = temp_array_0[i]; + index_left_partition++; + } + else { + temp_array_1[index_right_partition + n] = temp_array_0[i]; + index_right_partition++; + } + } + + //handle right partition + for (int i = n; i < n + num_right_partition; ++i) { + if (!(temp_array_0[i] & (1 << b))) { + temp_array_1[index_left_partition] = temp_array_0[i]; + index_left_partition++; + } + else { + temp_array_1[index_right_partition + n] = temp_array_0[i]; + index_right_partition++; + } + } + + int* temp = temp_array_0; + temp_array_0 = temp_array_1; + temp_array_1 = temp; + + num_left_partition = index_left_partition; + num_right_partition = index_right_partition; + index_left_partition = 0; + index_right_partition = 0; + } + + + + timer().endCpuTimer(); + + // handle left partition + int odata_index = 0; + for (int i = 0; i < num_left_partition; ++i) { + odata[odata_index] = temp_array_0[i]; + odata_index++; + } + + //handle right partition + for (int i = n; i < n + num_right_partition; ++i) { + odata[odata_index] = temp_array_0[i]; + odata_index++; + } + + delete[] temp_array_0; + delete[] temp_array_1; + } + + /** + * CPU radix sort implementation using std::stable_sort + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to sort. 
*/ + void stdSort(int n, int* odata, const int* idata) { + std::vector<int> vect_idata(idata, idata + n); + + timer().startCpuTimer(); + std::stable_sort(vect_idata.begin(), vect_idata.end()); + timer().endCpuTimer(); + + for (int i = 0; i < n; ++i) { + odata[i] = vect_idata[i]; + } } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..f1eb1a4 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -11,5 +11,9 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata); int compactWithScan(int n, int *odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); + + void stdSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..7b822ef 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,15 +12,202 @@ namespace StreamCompaction { return timer; } + __global__ void kern_ComputeEfficientExclusiveScanUpSweepIteration(int N, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num % offset_d_plus_1 == 0) { + odata[thread_num + offset_d_plus_1 - 1] += odata[thread_num + offset_d - 1]; + } + + } + + __global__ void kern_ComputeEfficientExclusiveScanDownSweepIteration(int N, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num % offset_d_plus_1 == 0) { + int t = odata[thread_num + offset_d - 1]; + odata[thread_num + offset_d - 1] = odata[thread_num + offset_d_plus_1 - 1]; + odata[thread_num + offset_d_plus_1 - 1] += t; + } + + } + + __global__ void kern_ComputeEfficientExclusiveScanUpSweepIteration_optimized(int N, int N_over_2, int num_active_threads, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= num_active_threads) { + return; + } + + int reversed_thread_num = (N - thread_num) - 1; + int reversed_offset = (N - offset_d) - 1; + + if (reversed_thread_num > reversed_offset) { + odata[reversed_thread_num] += odata[reversed_thread_num - offset_d]; + } + } + + __global__ void kern_ComputeEfficientExclusiveScanDownSweepIteration_optimized(int N, int num_active_threads, int offset_d, int offset_d_plus_1, int* odata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + int reversed_thread_num = (N - thread_num) - 1; + int reversed_offset = (N - offset_d) - 1; + + if (reversed_thread_num > reversed_offset) { + int t = odata[reversed_thread_num - offset_d]; + odata[reversed_thread_num - offset_d] = odata[reversed_thread_num]; + odata[reversed_thread_num] += t; + } + + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { + int* dev_odata; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU (the host array only holds old_n valid elements) + cudaMemcpy(dev_odata, idata, sizeof(int) * old_n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_odata + (old_n), 0, sizeof(int) * (n - old_n)); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_odata + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + timer().endGpuTimer(); + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata); + cudaDeviceSynchronize(); } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + * + * This is a version of Efficient::Scan that is optimized as part of EXTRA CREDIT in + * Part 5: Why is My GPU Approach So Slow?
+ */ + void scan_optimized(int n, int* odata, const int* idata) { + /*int* dev_odata; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_odata + (old_n), 0, sizeof(int) * (n - old_n)); + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + int num_active_threads = n; + + // UPSWEEP + for (int d = num_iterations; d > 0; --d) { + // compute upsweep iteration d + num_active_threads /= 2; + numBlocks = (num_active_threads + blockSize - 1) / blockSize; + //std::cout << num_active_threads << std::endl; + int reversed_thread_num = (n - 0) - 1; + int reversed_offset = (n - num_active_threads) - 1; + //std::cout << "fe: " << reversed_thread_num << " " << num_active_threads << " used_index: " << reversed_thread_num - num_active_threads << std::endl; + kern_ComputeEfficientExclusiveScanUpSweepIteration_optimized << < numBlocks, blockSize >> > (n, n / 2, num_active_threads, num_active_threads, pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_odata + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + numBlocks = (n + blockSize - 1) / blockSize; + + num_active_threads = 1; + + // DOWNSWEEP + for (int d = 0; d < num_iterations; ++d) { + // compute downsweep iteration d + + numBlocks = (num_active_threads + blockSize - 1) / blockSize; + kern_ComputeEfficientExclusiveScanDownSweepIteration_optimized <<< numBlocks, blockSize >>> (n, num_active_threads, num_active_threads, pow(2, d + 1), dev_odata); + cudaDeviceSynchronize(); + num_active_threads *= 2; + } + + timer().endGpuTimer(); + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata); + cudaDeviceSynchronize();*/ + } + + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + * + * This is a version of Efficient::Scan that is optimized using shared memory as part of EXTRA CREDIT in + * Part 6: GPU Scan Using Shared Memory && Hardware Optimization + */ + void scan_sharedMem(int n, int* odata, const int* idata) { + + } + + + + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -30,11 +217,199 @@ namespace StreamCompaction { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. 
*/ - int compact(int n, int *odata, const int *idata) { + int compact(int n, int* odata, const int* idata) { + int* dev_idata; + int* dev_odata; + int* dev_boolean_map; + int* dev_indices; + + // need to pad out original array to nearest power of two size + int old_n = n; + n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_boolean_map, n * sizeof(int)); + cudaMalloc((void**)&dev_indices, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU (the host array only holds old_n valid elements) + cudaMemcpy(dev_idata, idata, sizeof(int) * old_n, cudaMemcpyHostToDevice); + // pad out rest of the expanded length with 0s + cudaMemset(dev_idata + (old_n), 0, sizeof(int) * (n - old_n)); + + cudaDeviceSynchronize(); + + timer().startGpuTimer(); + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + + //////////////////////////////////////////// + // MAP TO BOOLEAN + + StreamCompaction::Common::kernMapToBoolean << < numBlocks, blockSize >> > (n, dev_boolean_map, dev_idata); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + + // copy data from boolean map to index array for scan + cudaMemcpy(dev_indices, dev_boolean_map, sizeof(int) * n, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); + // pad out rest of the expanded length with 0s + cudaMemset(dev_indices + (old_n), 0, sizeof(int) * (n - old_n)); + + //////////////////////////////////////////// + // SCAN + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_indices + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + //////////////////////////////////////////// + + //////////////////////////////////////////// + // SCATTER + + StreamCompaction::Common::kernScatter << < numBlocks, blockSize >> > (n, dev_odata, dev_idata, dev_boolean_map, dev_indices); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + + + + timer().endGpuTimer(); + + // copy over last index value from gpu index array to local int to return + int last_index_val = 0; + int last_bool_val = 0; + cudaMemcpy(&last_index_val, dev_indices + old_n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&last_bool_val, dev_boolean_map + old_n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + int num_elements = last_index_val; + if (last_bool_val == 1) { + ++num_elements; + } + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_odata, sizeof(int) * old_n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_boolean_map); + cudaFree(dev_indices); + cudaDeviceSynchronize(); + return num_elements; + } + + // + // RADIX SORT NOT FULLY IMPLEMENTED + // + + /** + * Performs radix sort on idata, storing the result into odata. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements.
+ * @param idata The array of elements to sort. + */ + void radixSort(int n, int* odata, const int* idata) { + int* dev_i; + int* dev_b; + int* dev_e; + int* dev_f; + int* dev_t; + int* dev_d; + + // need to pad out original array to nearest power of two size + //int old_n = n; + //n = pow(2, ilog2ceil(n)); + + cudaMalloc((void**)&dev_i, n * sizeof(int)); + cudaMalloc((void**)&dev_b, n * sizeof(int)); + cudaMalloc((void**)&dev_e, n * sizeof(int)); + cudaMalloc((void**)&dev_f, n * sizeof(int)); + cudaMalloc((void**)&dev_t, n * sizeof(int)); + cudaMalloc((void**)&dev_d, n * sizeof(int)); + cudaDeviceSynchronize(); + + // copy original data to GPU + cudaMemcpy(dev_i, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + for (int b = 0; b < sizeof(int) * 8; ++b) { + //////////////////////////////////////////// + // MAP TO BOOLEAN + StreamCompaction::Common::kernMapToBooleanBitwiseCheck << < numBlocks, blockSize >> > (n, 1 << b, dev_e, dev_i); + cudaDeviceSynchronize(); + + //////////////////////////////////////////// + /* + // copy data from boolean map to index array for scan + cudaMemcpy(dev_f, dev_e, sizeof(int) * n, cudaMemcpyDeviceToDevice); + cudaDeviceSynchronize(); + // pad out rest of the expanded length with 0s + cudaMemset(dev_f + (old_n), 0, sizeof(int) * (n - old_n)); + + //////////////////////////////////////////// + // SCAN + + // UPSWEEP + for (int d = 0; d <= num_iterations - 1; ++d) { + // compute upsweep iteration d + kern_ComputeEfficientExclusiveScanUpSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + // set last element in array to 0 + cudaMemset(dev_indices + n - 1, 0, sizeof(int)); + cudaDeviceSynchronize(); + + // DOWNSWEEP + for (int d = num_iterations - 1; d >= 0; --d) { + // compute downsweep iteration d + kern_ComputeEfficientExclusiveScanDownSweepIteration << < numBlocks, blockSize >> > (n, pow(2, d), pow(2, d + 1), dev_indices); + cudaDeviceSynchronize(); + } + + //////////////////////////////////////////// + */ + } + timer().endGpuTimer(); - return -1; + + // copy over array from device to output array on cpu, copying only original length data + cudaMemcpy(odata, dev_e, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_i); + cudaFree(dev_b); + cudaFree(dev_e); + cudaFree(dev_f); + cudaFree(dev_t); + cudaFree(dev_d); + cudaDeviceSynchronize(); } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..a5fed4c 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -8,6 +8,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata); + void scan_optimized(int n, int* odata, const int* idata); + + void scan_sharedMem(int n, int* odata, const int* idata); + int compact(int n, int *odata, const int *idata); + + void radixSort(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..e4a7b20 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,109 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kern_ComputeExclusiveScanIteration(int N, int offset, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + // warp 0 will 
have divergent branches on offset == 1, but that would happen anyway if this were moved to a separate kernel + if (offset == 1 && thread_num == 0) { + // SPECIAL CASE: first iteration and 0th element should be zeroed-out for exclusive scan + odata[thread_num] = 0; + } + else if (offset == 1 && thread_num == 1) { + // SPECIAL CASE: first iteration and 1st element should be equal to the original 0th element for exclusive scan + odata[thread_num] = idata[thread_num - 1]; + } + else if (offset == 1 && thread_num > 1) { + // SPECIAL CASE: first iteration elements past index 1 should use two previous values for summing + odata[thread_num] = idata[thread_num - 2] + idata[thread_num - 1]; + } + else if (thread_num < offset) { + // same as for inclusive + odata[thread_num] = idata[thread_num]; + } + else { + // same as for inclusive + odata[thread_num] = idata[thread_num - offset] + idata[thread_num]; + } + } + + __global__ void kern_ComputeInclusiveScanIteration(int N, int offset, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + + if (thread_num < offset) { + odata[thread_num] = idata[thread_num]; + } + else { + odata[thread_num] = idata[thread_num - offset] + idata[thread_num]; + } + } + + __global__ void kern_InclusiveToExclusiveScan(int N, int* odata, int* idata) { + int thread_num = threadIdx.x + (blockIdx.x * blockDim.x); + if (thread_num >= N) { + return; + } + + if (thread_num == 0) { + odata[thread_num] = 0; + } + else { + odata[thread_num] = idata[thread_num - 1]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int* dev_odata_1; + int* dev_odata_2; + + cudaMalloc((void**)&dev_odata_1, n * sizeof(int)); + cudaMalloc((void**)&dev_odata_2, n * sizeof(int)); + cudaDeviceSynchronize(); + + cudaMemcpy(dev_odata_1, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + int numBlocks = (n + blockSize - 1) / blockSize; + + int num_iterations = ilog2ceil(n); + for (int d = 1; d <= num_iterations; ++d) { + // compute scan iteration d +#ifdef inclusiveToExclusive + kern_ComputeInclusiveScanIteration <<< numBlocks, blockSize >>> (n, pow(2, d - 1), dev_odata_2, dev_odata_1); +#else + kern_ComputeExclusiveScanIteration <<< numBlocks, blockSize >>> (n, pow(2, d - 1), dev_odata_2, dev_odata_1); +#endif + cudaDeviceSynchronize(); + int* temp = dev_odata_1; + dev_odata_1 = dev_odata_2; + dev_odata_2 = temp; + } + +#ifdef inclusiveToExclusive + kern_InclusiveToExclusiveScan <<< numBlocks, blockSize >>> (n, dev_odata_2, dev_odata_1); + cudaDeviceSynchronize(); +#endif timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata_1, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + cudaFree(dev_odata_1); + cudaFree(dev_odata_2); + cudaDeviceSynchronize(); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..2f5d8a5 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,16 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/ void scan(int n, int *odata, const int *idata) { + thrust::device_vector<int> thrust_dv_idata(idata, idata + n); + thrust::device_vector<int> thrust_dv_odata(odata, odata + n); + + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(thrust_dv_idata.begin(), thrust_dv_idata.end(), thrust_dv_odata.begin()); + timer().endGpuTimer(); + + cudaMemcpy(odata, (thrust_dv_odata.data()).get(), sizeof(int) * n, cudaMemcpyDeviceToHost); } } }