diff --git a/include/hashinator/hashinator.h b/include/hashinator/hashinator.h index e2a37e4..2b6fa39 100644 --- a/include/hashinator/hashinator.h +++ b/include/hashinator/hashinator.h @@ -1183,11 +1183,11 @@ class Hashmap { buckets, elements, rule, nBlocks, mPool, s); return retval; } - template - void extractPatternLoop(split::SplitVector>& elements, Rule rule, + template >> + void extractPatternLoop(split::SplitVector, ALLOCATOR>& elements, Rule rule, split_gpuStream_t s = 0) { // Extract elements matching the Pattern Rule(element)==true; - split::tools::copy_if_loop, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( + split::tools::copy_if_loop, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>( *device_buckets, elements, rule, s); } void extractLoop(split::SplitVector>& elements, split_gpuStream_t s = 0) { @@ -1226,10 +1226,10 @@ class Hashmap { defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s); return elements.size(); } - template - void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { + template >> + void extractKeysByPatternLoop(split::SplitVector& elements, Rule rule, split_gpuStream_t s = 0) { // Extract element **keys** matching the Pattern Rule(element)==true; - split::tools::copy_if_keys_loop, KEY_TYPE, Rule, defaults::MAX_BLOCKSIZE, + split::tools::copy_if_keys_loop, KEY_TYPE, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(*device_buckets, elements, rule, s); } @@ -1250,7 +1250,8 @@ class Hashmap { }; return extractKeysByPattern(elements, rule, stack, max_size, s); } - void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { + template >> + void extractAllKeysLoop(split::SplitVector& elements, split_gpuStream_t s = 0) { // Extract all keys auto rule = [] __host__ __device__(const hash_pair& kval) -> bool { return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE; diff --git a/include/splitvector/split_tools.h 
b/include/splitvector/split_tools.h index 045a624..838a749 100644 --- a/include/splitvector/split_tools.h +++ b/include/splitvector/split_tools.h @@ -715,9 +715,9 @@ __global__ void block_compact_keys(T* input, U* output, size_t inputSize, Rule r } } -template +template __global__ void loop_compact(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -797,9 +797,9 @@ __global__ void loop_compact(split::SplitVector +template __global__ void loop_compact_keys(split::SplitVector>& inputVec, - split::SplitVector>& outputVec, Rule rule) { + split::SplitVector& outputVec, Rule rule) { // This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH __shared__ uint32_t warpSums[WARPLENGTH]; __shared__ uint32_t outputCount; @@ -988,6 +988,24 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, size_t nBlocks return numel; } +/** + * @brief Extraction routines using just a single block. + These methods assume splitvectors are fully allocated on UM or Device. + */ + +template +void copy_if_loop(split::SplitVector>& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { +#ifdef HASHINATOR_DEBUG + bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); + bool output_ok = isDeviceAccessible(reinterpret_cast(&output)); + assert((input_ok && output_ok) && + "This method supports splitvectors dynamically allocated on device or unified memory!"); +#endif + split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); +} + /** * @brief Extraction routines using just a single block. These methods assume splitvectors are fully allocated on UM or Device. 
@@ -1006,6 +1024,19 @@ void copy_if_loop(split::SplitVector>& inpu split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); } +template +void copy_if_keys_loop(split::SplitVector>& input, + split::SplitVector& output, Rule rule, + split_gpuStream_t s = 0) { +#ifdef HASHINATOR_DEBUG + bool input_ok = isDeviceAccessible(reinterpret_cast(&input)); + bool output_ok = isDeviceAccessible(reinterpret_cast(&output)); + assert((input_ok && output_ok) && + "This method supports splitvectors dynamically allocated on device or unified memory!"); +#endif + split::tools::loop_compact_keys<<<1, BLOCKSIZE, 0, s>>>(input, output, rule); +} + template void copy_if_keys_loop(split::SplitVector>& input, split::SplitVector>& output, Rule rule, diff --git a/unit_tests/Makefile b/unit_tests/Makefile index 2cf0177..21707e5 100644 --- a/unit_tests/Makefile +++ b/unit_tests/Makefile @@ -7,7 +7,7 @@ EXTRA= --std=c++17 EXTRA+= -gencode arch=compute_60,code=sm_60 EXTRA+= -DHASHMAPDEBUG --expt-relaxed-constexpr --expt-extended-lambda -lpthread GTEST= -L/home/kstppd/libs/googletest/build/lib -I/home/kstppd/libs/googletest/googletest/include -lgtest -lgtest_main -lpthread -OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o +OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o custom_allocator.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o default: tests @@ -63,6 +63,9 @@ preallocated.o: stream_compaction/preallocated.cu stream_compaction2.o: stream_compaction/unit.cu ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction2 stream_compaction/unit.cu +custom_allocator.o: custom_allocator/unit.cu + ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o 
custom_allocator custom_allocator/unit.cu + delete_mechanism.o: delete_by_compaction/main.cu ${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o delete_mechanism delete_by_compaction/main.cu diff --git a/unit_tests/custom_allocator/unit.cu b/unit_tests/custom_allocator/unit.cu new file mode 100644 index 0000000..9c2a2b9 --- /dev/null +++ b/unit_tests/custom_allocator/unit.cu @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include +#include +#include "../../include/splitvector/splitvec.h" +#include "../../include/splitvector/split_tools.h" +#include "../../include/common.h" +#include "../../include/splitvector/archMacros.h" +#define expect_true EXPECT_TRUE +#define expect_false EXPECT_FALSE +#define expect_eq EXPECT_EQ +#define TARGET 1 +/** + * @brief Custom allocator for unified memory (GPU and CPU accessible). + * + * This class provides an allocator for unified memory, which can be accessed + * by both the GPU and the CPU. It allocates and deallocates memory using split_gpuMallocManaged + * and split_gpuFree functions, while also providing constructors and destructors for objects. + * + * @tparam T Type of the allocated objects. + */ +template +class customAllocator { +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + template + struct rebind { + typedef customAllocator other; + }; + /** + * @brief Default constructor. + */ + customAllocator() throw() {} + + /** + * @brief Copy constructor with different type.
+ */ + template + customAllocator(customAllocator const&) throw() {} + pointer address(reference x) const { return &x; } + const_pointer address(const_reference x) const { return &x; } + + pointer allocate(size_type n, const void* /*hint*/ = 0) { + T* ret; + assert(n && "allocate 0"); + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n * sizeof(value_type))); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + static void* allocate_raw(size_type n, const void* /*hint*/ = 0) { + void* ret; + SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&ret, n)); + if (ret == nullptr) { + throw std::bad_alloc(); + } + return ret; + } + + void deallocate(pointer p, size_type n) { + if (n != 0 && p != 0) { + SPLIT_CHECK_ERR(split_gpuFree(p)); + } + } + static void deallocate(void* p, size_type n) { + if (n != 0 && p != 0) { + SPLIT_CHECK_ERR(split_gpuFree(p)); + } + } + + size_type max_size() const throw() { + size_type max = static_cast(-1) / sizeof(value_type); + return (max > 0 ? max : 1); + } + + template + __host__ __device__ void construct(U* p, Args&&... 
args) { + ::new (p) U(std::forward(args)...); + } + + void destroy(pointer p) { p->~value_type(); } +}; + +typedef uint32_t int_type ; +typedef struct{ + int_type num; + int_type flag; +} test_t; +typedef split::SplitVector vector; +typedef split::SplitVector> customAllocatorVector; +size_t count = 0; + +void fill_vec(vector& v, size_t targetSize){ + count=0; + size_t st=0; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(1, std::numeric_limits::max()); + v.clear(); + while (v.size() < targetSize) { + int_type val =++st;// dist(gen); + v.push_back(test_t{val,(val%2==0)}); + if (val%2 == 0){count++;}; + } +} + +bool checkFlags(const customAllocatorVector& v,const int_type target){ + for (const auto& i:v){ + if (i.flag!=target){return false;} + } + return true; +} + +bool run_test_small_loop_variant(size_t size){ + // std::cout<<"Testing with vector size: "<bool{ return element.flag == 1 ;}; + auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;}; + customAllocatorVector* output1=new customAllocatorVector(nextPow2(2*v->size())); + customAllocatorVector* output2=new customAllocatorVector(nextPow2(2*v->size())); + + split::tools::copy_if_loop(*v,*output1,predicate_on); + split::tools::copy_if_loop(*v,*output2,predicate_off); + SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() ); + + bool sane1 = checkFlags(*output1,1); + bool sane2 = checkFlags(*output2,0); + bool sane3 = ((output1->size()+output2->size())==v->size()); + bool sane4 =( output1->size() ==count ); + bool sane5 = ( output2->size() ==v->size()-count ); + // printf( " %d - %d - %d - %d - %d\n",sane1,sane2,sane3,sane4,sane5 ); + bool retval = sane1 && sane2 && sane3 && sane4 && sane5; + return retval; +} + +TEST(StreamCompaction , Compaction_Tests_Linear_Loop_Variant){ + for (size_t s=32; s< 1024; s++ ){ + bool a = run_test_small_loop_variant(s); + expect_true(a); + } + +} + +int main(int argc, char* argv[]){ + 
::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}