From 60ce94eb3379a39b70e526e92625f45ea228d937 Mon Sep 17 00:00:00 2001 From: Kristi Belcher Date: Thu, 23 Jan 2025 10:21:46 -0800 Subject: [PATCH] trying to get the best benchmarks... --- benchmarks/old_rap_benchmark.cpp | 167 ----------------- benchmarks/rap_benchmark.cpp | 303 ++++++++++++++++++++++++++----- 2 files changed, 255 insertions(+), 215 deletions(-) delete mode 100644 benchmarks/old_rap_benchmark.cpp diff --git a/benchmarks/old_rap_benchmark.cpp b/benchmarks/old_rap_benchmark.cpp deleted file mode 100644 index 9ca157027..000000000 --- a/benchmarks/old_rap_benchmark.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "camp/camp.hpp" -#include "umpire/ResourceManager.hpp" -#include "umpire/Umpire.hpp" -#include "umpire/strategy/ResourceAwarePool.hpp" -#include "umpire/strategy/QuickPool.hpp" - -#if defined(UMPIRE_ENABLE_CUDA) -using resource_type = camp::resources::Cuda; -#elif defined(UMPIRE_ENABLE_HIP) -using resource_type = camp::resources::Hip; -#endif - -constexpr int NUM_ALLOC = 100; -const int NUM_RES = 4; -constexpr int SIZE = 1 << 21; -const int NUM_PER_BLOCK = 256; -const int NUM_BLOCKS = SIZE/NUM_PER_BLOCK; - -__global__ void touch_data(double* data, int i) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < SIZE) { - data[id] = id + i; - } -} - -int main(int, char**) -{ - auto& rm = umpire::ResourceManager::getInstance(); - auto rap_pool = rm.makeAllocator("rap-pool", rm.getAllocator("UM")); - auto qp_pool = rm.makeAllocator("qp-pool", rm.getAllocator("UM")); - double* a; - - // Create camp resources for device streams - std::vector resources(NUM_RES); - - std::cout<<"Timing " << NUM_ALLOC << " allocations and " << NUM_RES << " resources with the QuickPool ...."< duration_total[NUM_ALLOC]; - std::chrono::duration this_duration_total[NUM_ALLOC]; - std::chrono::time_point my_start, my_end; - - for (int r = 0; r < NUM_RES; r++) { - for (int i = 0; i < NUM_ALLOC; i++) { - auto start_total = std::chrono::high_resolution_clock::now(); - a = static_cast(qp_pool.allocate(SIZE * sizeof(double))); - touch_data<<>>(a, i); - qp_pool.deallocate(a); - auto end_total = std::chrono::high_resolution_clock::now(); - duration_total[i] = end_total - start_total; - } - - // Calculate average, max, and min durations - double total_duration = 0.0; - for (const auto& duration : duration_total) { - total_duration += duration.count(); - } - double average_duration = total_duration / NUM_ALLOC; - - auto min_duration = *std::min_element(duration_total, duration_total + NUM_ALLOC); - auto max_duration = *std::max_element(duration_total, duration_total + NUM_ALLOC); - - std::cout << "Resource " << r << " statistics:" << std::endl; - std::cout << "Average execution time: " << (average_duration * 1000.0) << " milliseconds" << std::endl; - std::cout << "Minimum execution time: " << (min_duration.count() * 1000.0) << " milliseconds" << std::endl; - std::cout << "Maximum execution time: " << (max_duration.count() * 1000.0) << " milliseconds" << std::endl; - } - - std::cout<< std::endl; - std::cout<<"Timing " << NUM_ALLOC << " allocations ACROSS " << NUM_RES << " resources with the QuickPool ...."<(qp_pool.allocate(SIZE * sizeof(double))); - touch_data<<>>(a, i); - qp_pool.deallocate(a); - my_end = std::chrono::high_resolution_clock::now(); - this_duration_total[i] = my_end - my_start; - } - - // Calculate average, max, and min durations - double total_duration = 0.0; - for (const auto& duration : this_duration_total) { - total_duration += duration.count(); - } - double average_duration = total_duration / NUM_ALLOC; - - auto min_duration = *std::min_element(duration_total, duration_total + NUM_ALLOC); - auto max_duration = *std::max_element(duration_total, duration_total + NUM_ALLOC); - - std::cout << "Average execution time: " << (average_duration * 1000.0) << " milliseconds" << std::endl; - std::cout << "Minimum execution time: " << (min_duration.count() * 1000.0) << " milliseconds" << std::endl; - std::cout << "Maximum execution time: " << (max_duration.count() * 1000.0) << " milliseconds" << std::endl; - } - - std::cout<< std::endl; - std::cout<<"Timing " << NUM_ALLOC << " allocations and " << NUM_RES << " resources with the ResourceAwarePool ...."<(rap_pool.allocate(SIZE * sizeof(double), resources[r])); - touch_data<<>>(a, i); - rap_pool.deallocate(a); - auto end_total = std::chrono::high_resolution_clock::now(); - duration_total[i] = end_total - start_total; - } - - // Calculate average, max, and min durations - double total_duration = 0.0; - for (const auto& duration : duration_total) { - total_duration += duration.count(); - } - double average_duration = total_duration / NUM_ALLOC; - - auto min_duration = *std::min_element(duration_total, duration_total + NUM_ALLOC); - auto max_duration = *std::max_element(duration_total, duration_total + NUM_ALLOC); - - std::cout << "Resource " << r << " statistics:" << std::endl; - std::cout << "Average execution time: " << (average_duration * 1000.0) << " milliseconds" << std::endl; - std::cout << "Minimum execution time: " << (min_duration.count() * 1000.0) << " milliseconds" << std::endl; - std::cout << "Maximum execution time: " << (max_duration.count() * 1000.0) << " milliseconds" << std::endl; - } - - std::cout<< std::endl; - std::cout<<"Timing " << NUM_ALLOC << " allocations ACROSS " << NUM_RES << " resources with the ResourceAwarePool ...."<(rap_pool.allocate(SIZE * sizeof(double), resources[ri])); - touch_data<<>>(a, i); - rap_pool.deallocate(a); - my_end = std::chrono::high_resolution_clock::now(); - this_duration_total[i] = my_end - my_start; - } - - // Calculate average, max, and min durations - double total_duration = 0.0; - for (const auto& duration : this_duration_total) { - total_duration += duration.count(); - } - double average_duration = total_duration / NUM_ALLOC; - - auto min_duration = *std::min_element(duration_total, duration_total + NUM_ALLOC); - auto max_duration = *std::max_element(duration_total, duration_total + NUM_ALLOC); - - std::cout << "Average execution time: " << (average_duration * 1000.0) << " milliseconds" << std::endl; - std::cout << "Minimum execution time: " << (min_duration.count() * 1000.0) << " milliseconds" << std::endl; - std::cout << "Maximum execution time: " << (max_duration.count() * 1000.0) << " milliseconds" << std::endl; - } - - return 0; -} - diff --git a/benchmarks/rap_benchmark.cpp b/benchmarks/rap_benchmark.cpp index cf6ac3cd8..a56e0eec9 100644 --- a/benchmarks/rap_benchmark.cpp +++ b/benchmarks/rap_benchmark.cpp @@ -17,76 +17,283 @@ using resource_type = camp::resources::Cuda; using resource_type = camp::resources::Hip; #endif -constexpr int NUM_ALLOC = 100; constexpr int SIZE = 1 << 18; -//const int NUM_PER_BLOCK = 256; -//const int NUM_BLOCKS = SIZE/NUM_PER_BLOCK; -__global__ void touch_data(double* data, int i) +void test_rap() { - int id = blockIdx.x * blockDim.x + threadIdx.x; + auto& rm = umpire::ResourceManager::getInstance(); + auto pool = rm.makeAllocator("rap-pool", rm.getAllocator("DEVICE")); + + int ROUNDS[5] = {16, 32, 64, 128, 256}; + + for(int f = 0; f < 5; f++) { + const int NUM_ALLOC = ROUNDS[f]; + std::cout << std::endl << "Number of Allocations: " << ROUNDS[f] << std::endl; + + double* a[NUM_ALLOC]; + + // Create camp resources for device streams + resource_type d1, d2; + + //Fill the pending list + auto start_total1a = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); + } + auto end_total1a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1a = end_total1a - start_total1a; + std::cout << "Execution time for allocating: " << (duration_total1a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + auto start_total1 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i], d1); + } + auto end_total1 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total1 - start_total1; + std::cout << "Execution time for deallocating: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Reallocate with the same resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with the SAME resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + } + + auto start_total2 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i], d1); + } + auto end_total2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2 = end_total2 - start_total2; + std::cout << "Execution time for deallocating: " << (duration_total2.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Fill the pending list + auto start_total2a = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); + } + auto end_total2a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2a = end_total2a - start_total2a; + std::cout << "Execution time for allocating: " << (duration_total2a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; - if (id < SIZE) { - data[id] = id + i; + auto start_total3 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i], d1); + } + auto end_total3 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total3 = end_total3 - start_total3; + std::cout << "Execution time for deallocating: " << (duration_total3.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Reallocate with different resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d2)); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with a DIFFERENT resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + } + + auto start_total4 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i], d2); + } + auto end_total4 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total4 = end_total4 - start_total4; + std::cout << "Execution time for deallocating: " << (duration_total4.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; } } -int main(int, char**) +void test_qp() { auto& rm = umpire::ResourceManager::getInstance(); - auto pool = rm.makeAllocator("rap-pool", rm.getAllocator("UM")); - //auto pool = rm.makeAllocator("qp-pool", rm.getAllocator("UM")); + auto pool = rm.makeAllocator("qp-pool", rm.getAllocator("DEVICE")); - double* a[NUM_ALLOC]; + int ROUNDS[5] = {16, 32, 64, 128, 256}; - // Create camp resources for device streams - resource_type d1, d2; + for(int f = 0; f < 5; f++) { + const int NUM_ALLOC = ROUNDS[f]; + std::cout << std::endl << "Number of Allocations: " << ROUNDS[f] << std::endl; - //Fill the pending list - for( int i = 0; i < NUM_ALLOC; i ++) { - a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); - } - for( int i = 0; i < NUM_ALLOC; i ++) { - pool.deallocate(a[i]); - } + double* a[NUM_ALLOC]; - //Reallocate with the same resource - { - auto start_total = std::chrono::high_resolution_clock::now(); + //Fill the pending list + auto start_total1a = std::chrono::high_resolution_clock::now(); for( int i = 0; i < NUM_ALLOC; i ++) { - a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); + a[i] = static_cast(pool.allocate(SIZE * sizeof(double))); } - auto end_total = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration_total1 = end_total - start_total; - std::cout << "Total execution time for reallocating with the SAME resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; - } + auto end_total1a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1a = end_total1a - start_total1a; + std::cout << "Execution time for allocating: " << (duration_total1a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; - for( int i = 0; i < NUM_ALLOC; i ++) { - pool.deallocate(a[i]); - } + auto start_total1 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i]); + } + auto end_total1 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total1 - start_total1; + std::cout << "Execution time for deallocating1: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Reallocate with the same resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double))); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with the SAME resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + } + + auto start_total2 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i]); + } + auto end_total2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2 = end_total2 - start_total2; + std::cout << "Execution time for deallocating2: " << (duration_total2.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; - //Fill the pending list - for( int i = 0; i < NUM_ALLOC; i ++) { - a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d1)); - } - for( int i = 0; i < NUM_ALLOC; i ++) { - pool.deallocate(a[i]); - } + //Fill the pending list + auto start_total2a = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double))); + } + auto end_total2a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2a = end_total2a - start_total2a; + std::cout << "Execution time for allocating: " << (duration_total2a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + auto start_total3 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i]); + } + auto end_total3 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total3 = end_total3 - start_total3; + std::cout << "Execution time for deallocating3: " << (duration_total3.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; - //Reallocate with different resource - { - auto start_total = std::chrono::high_resolution_clock::now(); - for( int i = 0; i < NUM_ALLOC; i++) { - a[i] = static_cast(pool.allocate(SIZE * sizeof(double), d2)); + //Reallocate with different resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i++) { + a[i] = static_cast(pool.allocate(SIZE * sizeof(double))); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with a DIFFERENT resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; } - auto end_total = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration_total1 = end_total - start_total; - std::cout << "Total execution time for reallocating with a DIFFERENT resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + auto start_total4 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + pool.deallocate(a[i]); + } + auto end_total4 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total4 = end_total4 - start_total4; + std::cout << "Execution time for deallocating4: " << (duration_total4.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; } +} - for( int i = 0; i < NUM_ALLOC; i ++) { - pool.deallocate(a[i]); +void test_device_alloc() +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto not_pool = rm.getAllocator("DEVICE"); + + int ROUNDS[5] = {16, 32, 64, 128, 256}; + + for(int f = 0; f < 5; f++) { + const int NUM_ALLOC = ROUNDS[f]; + std::cout << std::endl << "Number of Allocations: " << ROUNDS[f] << std::endl; + + double* a[NUM_ALLOC]; + + //Fill the pending list + auto start_total1a = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(not_pool.allocate(SIZE * sizeof(double))); + } + auto end_total1a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1a = end_total1a - start_total1a; + std::cout << "Execution time for allocating: " << (duration_total1a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + auto start_total1 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + not_pool.deallocate(a[i]); + } + auto end_total1 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total1 - start_total1; + std::cout << "Execution time for deallocating1: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Reallocate with the same resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(not_pool.allocate(SIZE * sizeof(double))); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with the SAME resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + } + + auto start_total2 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + not_pool.deallocate(a[i]); + } + auto end_total2 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2 = end_total2 - start_total2; + std::cout << "Execution time for deallocating2: " << (duration_total2.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Fill the pending list + auto start_total2a = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + a[i] = static_cast(not_pool.allocate(SIZE * sizeof(double))); + } + auto end_total2a = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total2a = end_total2a - start_total2a; + std::cout << "Execution time for allocating: " << (duration_total2a.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + auto start_total3 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + not_pool.deallocate(a[i]); + } + auto end_total3 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total3 = end_total3 - start_total3; + std::cout << "Execution time for deallocating3: " << (duration_total3.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + + //Reallocate with different resource + { + auto start_total = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i++) { + a[i] = static_cast(not_pool.allocate(SIZE * sizeof(double))); + } + auto end_total = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total1 = end_total - start_total; + std::cout << "Total execution time for reallocating with a DIFFERENT resource: " << (duration_total1.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; + } + + auto start_total4 = std::chrono::high_resolution_clock::now(); + for( int i = 0; i < NUM_ALLOC; i ++) { + not_pool.deallocate(a[i]); + } + auto end_total4 = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration_total4 = end_total4 - start_total4; + std::cout << "Execution time for deallocating4: " << (duration_total4.count() / NUM_ALLOC * 1000.0) << " milliseconds" << std::endl; } +} + +int main(int, char**) +{ + std::cout << "--------Starting ResourceAwarePool tests--------" << std::endl; + test_rap(); + std::cout << "----------------" << std::endl; + std::cout << "--------Starting QuickPool tests--------" << std::endl; + test_qp(); + std::cout << "----------------" << std::endl; + std::cout << "--------Starting DEVICE alloc tests--------" << std::endl; + test_device_alloc(); + std::cout << "----------------" << std::endl; return 0; }