diff --git a/README.md b/README.md index 0e38ddb..6e60a14 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,93 @@ CUDA Stream Compaction ====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** + +* Tom Donnelly + * [LinkedIn](https://www.linkedin.com/in/the-tom-donnelly/) +* Tested on: Windows 11, AMD Ryzen 9 5900X, NVIDIA GeForce RTX 3070 (Personal Desktop) + +### Description +An Implementation and Comparison of Stream Compaction and Scan Algorithms for Naive, Work Efficient, and Thrust implementations on the CPU and GPU. + +### Questions and Analysis + +## Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. +![](img/naive_blocksize.png) +![](img/Efficent_block_size.png) +Average runtimes were taken for 5 runs at an array of 1048576 elements. A block size of 128 was chosen for the Naive implementation and a block size of 64 was chosen for the efficient implementation. + +## Comparison + ![](img/pot_scan.png) + ![](img/npot_scan.png) + ![](img/compact.png) +After around 1048576 elements, the CPU implementation of Scan loses performance taking the most time out of any implementation. The fastest implementation is Thrust, followed by the work efficient and Naive implementations respectively. This trend occurs both in arrays of a power of two and non power of two arrays. The compact GPU implementation is much faster than the CPU at higher array values, having a 5.7X speedup at 268435456 +elements. +Thrust appears to be calling an asynchronous malloc and memcopy in CUDA to initialize device vectors. It is then calling DeviceScanInitKernel and DeviceScanKernel to implement the exclusive scan. It is likely using shared memory to perform parallel operations quickly. +### Explanation +All CPU implementations are serialized, meaning they are run one after the other and constrained by the speed of one thread on the CPU. 
The memory operations are quick so the implementation works well for small arrays but very quickly falls off for larger arrays. The Naive implementation is limited by the access pattern: it launches the maximum number of threads for each offset and is limited by the number of calculations being done. The work efficient implementation launches the minimum number of threads and is likely limited by global memory I/O on the GPU. It could be improved by using shared memory. The Thrust implementation is the fastest and most likely hardware limited. + +``` + +**************** +** SCAN TESTS ** +**************** + [ 39 5 12 8 25 17 10 49 5 23 16 21 3 ... 34 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6408 6442 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0003ms (std::chrono Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6372 6383 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.077664ms (CUDA Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6408 6442 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.082944ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.169984ms (CUDA Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6408 6442 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.140256ms (CUDA Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6372 6383 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.052992ms (CUDA Measured) + [ 0 39 44 56 64 89 106 116 165 170 193 209 230 ... 6408 6442 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.070656ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 3 2 0 1 3 0 3 3 3 0 1 1 ... 
2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 3 3 2 1 3 3 3 3 1 1 1 1 1 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0008ms (std::chrono Measured) + [ 3 3 2 1 3 3 3 3 1 1 1 1 1 ... 3 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0014ms (std::chrono Measured) + [ 3 3 2 1 3 3 3 3 1 1 1 1 1 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.16896ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.218112ms (CUDA Measured) + passed +Press any key to continue . . . +``` + -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) 
diff --git a/img/Efficent_block_size.png b/img/Efficent_block_size.png new file mode 100644 index 0000000..14c8273 Binary files /dev/null and b/img/Efficent_block_size.png differ diff --git a/img/compact.png b/img/compact.png new file mode 100644 index 0000000..52fbf1e Binary files /dev/null and b/img/compact.png differ diff --git a/img/naive_blocksize.png b/img/naive_blocksize.png new file mode 100644 index 0000000..963ba70 Binary files /dev/null and b/img/naive_blocksize.png differ diff --git a/img/npot_scan.png b/img/npot_scan.png new file mode 100644 index 0000000..e35c639 Binary files /dev/null and b/img/npot_scan.png differ diff --git a/img/pot_scan.png b/img/pot_scan.png new file mode 100644 index 0000000..54467e0 Binary files /dev/null and b/img/pot_scan.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..634fcc6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -71,21 +71,21 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); 
zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..dcab2ed 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -18,12 +18,22 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { namespace StreamCompaction { namespace Common { + __device__ int getIndex() + { + return threadIdx.x + (blockIdx.x * blockDim.x); + } + /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) + { + return; + } + bools[index] = (idata[index] != 0); } /** @@ -32,7 +42,18 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index > n) + { + return; + } + if (bools[index] == 1) + { + int new_index = indices[index]; + odata[new_index] = idata[index]; + } + + } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed..ed4723e 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -32,6 +32,7 @@ inline int ilog2ceil(int x) { namespace StreamCompaction { namespace Common { + __device__ int getIndex(); __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..b3b4fa4 100644 --- a/stream_compaction/cpu.cu +++ 
b/stream_compaction/cpu.cu @@ -19,7 +19,17 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + odata[0] = idata[0]; + for (int k = 1; k < n; k++) + { + odata[k] = odata[k - 1] + idata[k]; + } + //shift + for (int i = n - 1; i > 0; i--) + { + odata[i] = odata[i - 1]; + } + odata[0] = 0; timer().endCpuTimer(); } @@ -30,9 +40,17 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int num_elements = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[num_elements] = idata[i]; + num_elements++; + } + } timer().endCpuTimer(); - return -1; + return num_elements; } /** @@ -42,9 +60,46 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + int* temp_array = new int[n]; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + temp_array[i] = 1; + } + else + { + temp_array[i] = 0; + } + } + //Scan + odata[0] = temp_array[0]; + for (int k = 1; k < n; k++) + { + odata[k] = odata[k - 1] + temp_array[k]; + } + //shift + for (int i = n - 1; i > 0; i--) + { + odata[i] = odata[i - 1]; + } + odata[0] = 0; + + //Scatter + int num_elements = 0; + for (int i = 0; i < n; i++) + { + if (temp_array[i] == 1) + { + odata[odata[i]] = idata[i]; + num_elements++; + } + } + delete[] temp_array; + + timer().endCpuTimer(); - return -1; + return num_elements; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..fb0be41 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,13 +12,141 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweepOpt(int n, int depth, int offset, int* data) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int offset_1 = offset << 1; + int new_index = index * offset_1 + offset_1 - 1; + if (index > (n-1)) + { + return; + 
} + data[new_index] += data[index * offset_1 + offset - 1]; + + } + __global__ void kernDownSweepOpt(int n, int depth, int offset, int* data) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int offset_1 = offset << 1; + int parent_i = index * offset_1 + offset_1 - 1; + int left_i = index * offset_1 + offset - 1; + if (index > (n-1)) + { + return; + } + if (n == 1) + { + data[parent_i] = 0; + } + int t = data[left_i]; + data[left_i] = data[parent_i]; + data[parent_i] += t; + + + } + + //__global__ void kernUpSweep(int n, int depth, int offset, int* data) + //{ + // int index = threadIdx.x + (blockIdx.x * blockDim.x); + // if (index > n) + // { + // return; + // } + // if (((index + 1) % (1 << (depth + 1))) == 0) + // { + // data[index] += data[index - offset]; + // } + + //} + //__global__ void kernDownSweep(int n, int depth, int offset, int* data, bool root) + //{ + // int index = threadIdx.x + (blockIdx.x * blockDim.x); + // if (index > n) + // { + // return; + // } + // if (index == (n-1) && root) + // { + // data[index] = 0; + // } + // if (((index + 1) % (1 << (depth + 1))) == 0) + // { + // int t = data[index - offset]; + // data[index - offset] = data[index]; + // data[index] += t; + // } + + //} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ - void scan(int n, int *odata, const int *idata) { + //void scanNoOpt(int n, int *odata, const int *idata) { + // int blockSize = 128; + // int numBlocks = ((n + blockSize - 1) / blockSize); + // int* dev_data; + // float d = ilog2ceil(n); + // int pot = pow(2, d); + // cudaMalloc((void**)&dev_data, pot * sizeof(int)); + // cudaMemset(dev_data, 0, pot * sizeof(int)); + // cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + // cudaDeviceSynchronize(); + // timer().startGpuTimer(); + // for (int depth = 0; depth < d; depth++) + // { + // int offset = 1 << depth; + // kernUpSweep << < numBlocks, blockSize >> > (pot, depth, offset, dev_data); + // + // } + // bool root = true; + // cudaDeviceSynchronize(); + // for (int depth = d - 1; depth >= 0; depth--) + // { + // int offset = 1 << depth; + // kernDownSweep << < numBlocks, blockSize >> > (pot, depth, offset, dev_data, root); + // root = false; + // } + // timer().endGpuTimer(); + // cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + // cudaFree(dev_data); + //} + + void scan(int n, int* odata, const int* idata) { + int blockSize = 64; + int* dev_data; + int d = ilog2ceil(n); + int pot = 1 << d; + int num = pot; + cudaMalloc((void**)&dev_data, pot * sizeof(int)); + checkCUDAError("Malloc dev_data Failed! "); + cudaMemset(dev_data, 0, pot * sizeof(int)); + checkCUDAError("Memset dev_data Failed! "); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy dev_data Failed! 
"); + cudaDeviceSynchronize(); + timer().startGpuTimer(); - // TODO + + for (int depth = 0; depth < d; depth++) + { + num /= 2; + int offset = 1 << depth; + int numBlocks = ((num + blockSize - 1) / blockSize); + kernUpSweepOpt << < numBlocks, blockSize >> > (num, depth, offset, dev_data); + + } + cudaDeviceSynchronize(); + for (int depth = d - 1; depth >= 0; depth--) + { + int offset = 1 << depth; + int numBlocks = ((num + blockSize - 1) / blockSize); + kernDownSweepOpt << < numBlocks, blockSize >> > (num, depth, offset, dev_data); + num *= 2; + } timer().endGpuTimer(); + cudaMemcpy(odata, dev_data, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_data back Failed! "); + cudaFree(dev_data); } /** @@ -31,10 +159,67 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int blockSize = 64; + int numBlocks = ((n + blockSize - 1) / blockSize); + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indicies; + int d = ilog2ceil(n); + int pot = 1 << d; + int num = pot; + cudaMalloc((void**)&dev_idata, pot * sizeof(int)); + checkCUDAError("Malloc dev_idata Failed! "); + cudaMalloc((void**)&dev_odata, pot * sizeof(int)); + checkCUDAError("Malloc dev_odata Failed! "); + cudaMalloc((void**)&dev_bools, pot * sizeof(int)); + checkCUDAError("Malloc dev_bools Failed! "); + cudaMalloc((void**)&dev_indicies, pot * sizeof(int)); + checkCUDAError("Malloc dev_indices Failed! "); + cudaMemset(dev_idata, 0, pot * sizeof(int)); + checkCUDAError("Memset dev_idata Failed! "); + cudaMemset(dev_odata, 0, pot * sizeof(int)); + checkCUDAError("Memset dev_odata Failed! "); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy dev_idata Failed! 
"); + cudaDeviceSynchronize(); timer().startGpuTimer(); - // TODO + + Common::kernMapToBoolean << < numBlocks, blockSize >> > (n, dev_bools, dev_idata); + cudaMemcpy(dev_indicies, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToDevice); + for (int depth = 0; depth < d; depth++) + { + num /= 2; + int offset = 1 << depth; + int numBlocks = ((num + blockSize - 1) / blockSize); + kernUpSweepOpt << < numBlocks, blockSize >> > (num, depth, offset, dev_indicies); + + } + cudaDeviceSynchronize(); + for (int depth = d - 1; depth >= 0; depth--) + { + int offset = 1 << depth; + int numBlocks = ((num + blockSize - 1) / blockSize); + kernDownSweepOpt << < numBlocks, blockSize >> > (num, depth, offset, dev_indicies); + num *= 2; + } + Common::kernScatter << < numBlocks, blockSize >> > (n, dev_odata, dev_idata, dev_bools, dev_indicies); timer().endGpuTimer(); - return -1; + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy dev_odata Failed! "); + int num_elements = 0; + for (int i = 0; i < n; i++) + { + if (odata[i] != 0) + { + num_elements++; + } + } + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indicies); + return num_elements; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..bb638a2 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -5,7 +5,7 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - + __global__ void kernScan(int n, int* data); void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..659df8c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,66 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernScan(int n, int depth, int offset, int* odata, const int* 
idata) + { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + if (index == 0) + { + odata[index] = 0; + } + if (index >= offset) + { + odata[index] = idata[index - offset] + idata[index]; + } + else + { + odata[index] = idata[index]; + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int blockSize = 128; + int numBlocks = ((n + blockSize - 1) / blockSize); + int* dev_outDataA; + int* dev_outDataB; + cudaMalloc((void**)&dev_outDataA, n * sizeof(int)); + checkCUDAError("Malloc dev_outDataA Failed! "); + cudaMalloc((void**)&dev_outDataB, n * sizeof(int)); + checkCUDAError("Malloc dev_outDataB Failed! "); + //cudaMemset(dev_outDataA, 0, n * sizeof(int)); + checkCUDAError("Memset dev_outDataA Failed! "); + cudaDeviceSynchronize(); + cudaMemcpy(dev_outDataA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("Memcpy dev_outDataA Failed! "); + cudaDeviceSynchronize(); timer().startGpuTimer(); - // TODO + int d = ilog2ceil(n); + for (int k = 1; k <= d; k ++) + { + int offset = 1 << (k - 1); + kernScan <<< numBlocks, blockSize >>> (n, k, offset, dev_outDataB, dev_outDataA); + std::swap(dev_outDataB, dev_outDataA); + } timer().endGpuTimer(); + cudaMemcpy(odata, dev_outDataA, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("Memcpy Back dev_outDataA Failed! 
"); + //shift + for (int i = n - 1; i > 0; i--) + { + odata[i] = odata[i - 1]; + } + odata[0] = 0; + cudaFree(dev_outDataA); + cudaFree(dev_outDataB); + } } -} +} \ No newline at end of file diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..ba2f629 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -5,6 +5,7 @@ namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernScan(int n, int* odata, const int* data); void scan(int n, int *odata, const int *idata); } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..c1bafd9 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dev_thrust_idata(idata, idata+n); + thrust::host_vector host_thrust_odata(odata, odata + n); + thrust::device_vector dev_thrust_odata(host_thrust_odata); timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_thrust_idata.begin(), dev_thrust_idata.end(), dev_thrust_odata.begin()); timer().endGpuTimer(); + host_thrust_odata = dev_thrust_odata; + for (int i=0; i < n; i++) + { + odata[i] = host_thrust_odata[i]; + } } } }