Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions include/hashinator/hashinator.h
Original file line number Diff line number Diff line change
Expand Up @@ -1183,11 +1183,11 @@ class Hashmap {
buckets, elements, rule, nBlocks, mPool, s);
return retval;
}
template <typename Rule>
void extractPatternLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, Rule rule,
template <typename Rule, typename ALLOCATOR = split::split_unified_allocator<hash_pair<KEY_TYPE, VAL_TYPE>>>
void extractPatternLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>, ALLOCATOR>& elements, Rule rule,
split_gpuStream_t s = 0) {
// Extract elements matching the Pattern Rule(element)==true;
split::tools::copy_if_loop<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(
split::tools::copy_if_loop<hash_pair<KEY_TYPE, VAL_TYPE>, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE, defaults::WARPSIZE>(
*device_buckets, elements, rule, s);
}
void extractLoop(split::SplitVector<hash_pair<KEY_TYPE, VAL_TYPE>>& elements, split_gpuStream_t s = 0) {
Expand Down Expand Up @@ -1226,10 +1226,10 @@ class Hashmap {
defaults::WARPSIZE>(buckets, elements, rule, stack, max_size, s);
return elements.size();
}
template <typename Rule>
void extractKeysByPatternLoop(split::SplitVector<KEY_TYPE>& elements, Rule rule, split_gpuStream_t s = 0) {
template <typename Rule, typename ALLOCATOR = split::split_unified_allocator<hash_pair<KEY_TYPE, VAL_TYPE>>>
void extractKeysByPatternLoop(split::SplitVector<KEY_TYPE, ALLOCATOR>& elements, Rule rule, split_gpuStream_t s = 0) {
// Extract element **keys** matching the Pattern Rule(element)==true;
split::tools::copy_if_keys_loop<hash_pair<KEY_TYPE, VAL_TYPE>, KEY_TYPE, Rule, defaults::MAX_BLOCKSIZE,
split::tools::copy_if_keys_loop<hash_pair<KEY_TYPE, VAL_TYPE>, KEY_TYPE, Rule, ALLOCATOR, defaults::MAX_BLOCKSIZE,
defaults::WARPSIZE>(*device_buckets, elements, rule, s);
}

Expand All @@ -1250,7 +1250,8 @@ class Hashmap {
};
return extractKeysByPattern<prefetches>(elements, rule, stack, max_size, s);
}
void extractAllKeysLoop(split::SplitVector<KEY_TYPE>& elements, split_gpuStream_t s = 0) {
template <typename ALLOCATOR = split::split_unified_allocator<hash_pair<KEY_TYPE, VAL_TYPE>>>
void extractAllKeysLoop(split::SplitVector<KEY_TYPE, ALLOCATOR>& elements, split_gpuStream_t s = 0) {
// Extract all keys
auto rule = [] __host__ __device__(const hash_pair<KEY_TYPE, VAL_TYPE>& kval) -> bool {
return kval.first != EMPTYBUCKET && kval.first != TOMBSTONE;
Expand Down
39 changes: 35 additions & 4 deletions include/splitvector/split_tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -715,9 +715,9 @@ __global__ void block_compact_keys(T* input, U* output, size_t inputSize, Rule r
}
}

template <typename T, typename Rule, size_t BLOCKSIZE = 1024>
template <typename T, typename Rule, typename ALLOCATOR, size_t BLOCKSIZE = 1024>
__global__ void loop_compact(split::SplitVector<T, split::split_unified_allocator<T>>& inputVec,
split::SplitVector<T, split::split_unified_allocator<T>>& outputVec, Rule rule) {
split::SplitVector<T, ALLOCATOR>& outputVec, Rule rule) {
// This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH
__shared__ uint32_t warpSums[WARPLENGTH];
__shared__ uint32_t outputCount;
Expand Down Expand Up @@ -797,9 +797,9 @@ __global__ void loop_compact(split::SplitVector<T, split::split_unified_allocato
outputVec.device_resize(outputSize);
}
}
template <typename T, typename U, typename Rule, size_t BLOCKSIZE = 1024>
template <typename T, typename U, typename Rule, typename ALLOCATOR, size_t BLOCKSIZE = 1024>
__global__ void loop_compact_keys(split::SplitVector<T, split::split_unified_allocator<T>>& inputVec,
split::SplitVector<U, split::split_unified_allocator<U>>& outputVec, Rule rule) {
split::SplitVector<U, ALLOCATOR>& outputVec, Rule rule) {
// This must be equal to at least both WARPLENGTH and MAX_BLOCKSIZE/WARPLENGTH
__shared__ uint32_t warpSums[WARPLENGTH];
__shared__ uint32_t outputCount;
Expand Down Expand Up @@ -988,6 +988,24 @@ uint32_t copy_if_raw(T* input, T* output, size_t size, Rule rule, size_t nBlocks
return numel;
}

/**
* @brief Extraction routines using just a single block.
These methods assume splitvectors are fully allocated on UM or Device.
*/

// Single-block copy_if: compacts the elements of `input` for which
// Rule(element)==true into `output`, launching loop_compact with one block of
// BLOCKSIZE threads on stream `s`.
// ALLOCATOR lets the output vector use a caller-supplied allocator; WARP is
// unused in this body but kept after ALLOCATOR so that BLOCKSIZE/WARP remain
// defaultable when only ALLOCATOR is specified (see the matching overloads).
template <typename T, typename Rule, typename ALLOCATOR, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
void copy_if_loop(split::SplitVector<T, split::split_unified_allocator<T>>& input,
split::SplitVector<T, ALLOCATOR>& output, Rule rule,
split_gpuStream_t s = 0) {
#ifdef HASHINATOR_DEBUG
// Both SplitVector objects are passed by reference into the kernel, so the
// objects themselves must live in device-accessible (device or unified) memory.
bool input_ok = isDeviceAccessible(reinterpret_cast<void*>(&input));
bool output_ok = isDeviceAccessible(reinterpret_cast<void*>(&output));
assert((input_ok && output_ok) &&
"This method supports splitvectors dynamically allocated on device or unified memory!");
#endif
split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule);
}

/**
* @brief Extraction routines using just a single block.
These methods assume splitvectors are fully allocated on UM or Device.
Expand All @@ -1006,6 +1024,19 @@ void copy_if_loop(split::SplitVector<T, split::split_unified_allocator<T>>& inpu
split::tools::loop_compact<<<1, BLOCKSIZE, 0, s>>>(input, output, rule);
}

template <typename T, typename U, typename Rule, typename ALLOCATOR, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same question for all these pretty much

Copy link
Author

@MikaelHuppunen MikaelHuppunen Feb 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The functions would then break if ALLOCATOR is defined but BLOCKSIZE and WARP are not (or the other way around depending on the order of the arguments).

void copy_if_keys_loop(split::SplitVector<T, split::split_unified_allocator<T>>& input,
split::SplitVector<U, ALLOCATOR>& output, Rule rule,
split_gpuStream_t s = 0) {
#ifdef HASHINATOR_DEBUG
bool input_ok = isDeviceAccessible(reinterpret_cast<void*>(&input));
bool output_ok = isDeviceAccessible(reinterpret_cast<void*>(&output));
assert((input_ok && output_ok) &&
"This method supports splitvectors dynamically allocated on device or unified memory!");
#endif
split::tools::loop_compact_keys<<<1, BLOCKSIZE, 0, s>>>(input, output, rule);
}

template <typename T, typename U, typename Rule, size_t BLOCKSIZE = 1024, size_t WARP = WARPLENGTH>
void copy_if_keys_loop(split::SplitVector<T, split::split_unified_allocator<T>>& input,
split::SplitVector<U, split::split_unified_allocator<U>>& output, Rule rule,
Expand Down
5 changes: 4 additions & 1 deletion unit_tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ EXTRA= --std=c++17
EXTRA+= -gencode arch=compute_60,code=sm_60
EXTRA+= -DHASHMAPDEBUG --expt-relaxed-constexpr --expt-extended-lambda -lpthread
GTEST= -L/home/kstppd/libs/googletest/build/lib -I/home/kstppd/libs/googletest/googletest/include -lgtest -lgtest_main -lpthread
OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o
OBJ= gtest_vec_host.o gtest_vec_device.o gtest_hashmap.o stream_compaction.o stream_compaction2.o custom_allocator.o delete_mechanism.o insertion_mechanism.o hybrid_cpu.o hybrid_gpu.o pointer_test.o benchmark.o benchmarkLF.o tbPerf.o realistic.o preallocated.o


default: tests
Expand Down Expand Up @@ -63,6 +63,9 @@ preallocated.o: stream_compaction/preallocated.cu
stream_compaction2.o: stream_compaction/unit.cu
${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o compaction2 stream_compaction/unit.cu

# Build the custom-allocator unit test.
# Fix: the recipe was a copy-paste of the stream_compaction2 rule — it compiled
# stream_compaction/unit.cu into `compaction2`, so the new custom_allocator
# test was never actually built. Compile the correct source into its own binary.
custom_allocator.o: custom_allocator/unit.cu
	${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o custom_allocator custom_allocator/unit.cu

delete_mechanism.o: delete_by_compaction/main.cu
${CC} ${CXXFLAGS} ${OPT} ${EXTRA} ${GTEST} -o delete_mechanism delete_by_compaction/main.cu

Expand Down
160 changes: 160 additions & 0 deletions unit_tests/custom_allocator/unit.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#include <iostream>
#include <stdlib.h>
#include <chrono>
#include <limits>
#include <random>
#include <gtest/gtest.h>
#include "../../include/splitvector/splitvec.h"
#include "../../include/splitvector/split_tools.h"
#include "../../include/common.h"
#include "../../include/splitvector/archMacros.h"
#define expect_true EXPECT_TRUE
#define expect_false EXPECT_FALSE
#define expect_eq EXPECT_EQ
#define TARGET 1

/**
* @brief Custom allocator for unified memory (GPU and CPU accessible).
*
* This class provides an allocator for unified memory, which can be accessed
* by both the GPU and the CPU. It allocates and deallocates memory using split_gpuMallocManaged
* and split_gpuFree functions, while also providing constructors and destructors for objects.
*
* @tparam T Type of the allocated objects.
*/
template <class T>
class customAllocator {
public:
   using value_type = T;
   using pointer = value_type*;
   using const_pointer = const value_type*;
   using reference = value_type&;
   using const_reference = const value_type&;
   using difference_type = ptrdiff_t;
   using size_type = size_t;

   /// Rebind support so containers can allocate their internal node types.
   template <class U>
   struct rebind {
      typedef customAllocator<U> other;
   };

   /// Default constructor — the allocator is stateless.
   customAllocator() throw() {}

   /// Converting constructor from an allocator of a different type (stateless).
   template <class U>
   customAllocator(customAllocator<U> const&) throw() {}

   pointer address(reference x) const { return &x; }
   const_pointer address(const_reference x) const { return &x; }

   /// Allocate storage for n objects via split_gpuMallocManaged (unified memory).
   /// Throws std::bad_alloc when the returned pointer is null.
   pointer allocate(size_type n, const void* /*hint*/ = 0) {
      T* mem;
      assert(n && "allocate 0");
      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&mem, n * sizeof(value_type)));
      if (mem == nullptr) {
         throw std::bad_alloc();
      }
      return mem;
   }

   /// Untyped managed allocation of n bytes; throws std::bad_alloc on null.
   static void* allocate_raw(size_type n, const void* /*hint*/ = 0) {
      void* mem;
      SPLIT_CHECK_ERR(split_gpuMallocManaged((void**)&mem, n));
      if (mem == nullptr) {
         throw std::bad_alloc();
      }
      return mem;
   }

   /// Release typed storage previously obtained from allocate().
   void deallocate(pointer p, size_type n) {
      if (n != 0 && p != nullptr) {
         SPLIT_CHECK_ERR(split_gpuFree(p));
      }
   }

   /// Release untyped storage previously obtained from allocate_raw().
   static void deallocate(void* p, size_type n) {
      if (n != 0 && p != nullptr) {
         SPLIT_CHECK_ERR(split_gpuFree(p));
      }
   }

   /// Largest object count whose byte size still fits in size_type.
   size_type max_size() const throw() {
      const size_type upper = static_cast<size_type>(-1) / sizeof(value_type);
      return (upper > 0 ? upper : 1);
   }

   /// Placement-construct an object of type U at p, forwarding the arguments.
   template <typename U, typename... Args>
   __host__ __device__ void construct(U* p, Args&&... args) {
      ::new (p) U(std::forward<Args>(args)...);
   }

   /// Destroy the object at p without releasing its storage.
   void destroy(pointer p) { p->~value_type(); }
};

// Shared aliases and bookkeeping used by all tests in this file.
using int_type = uint32_t;

// Payload stored in the vectors: a value plus a 0/1 flag (1 when num is even).
struct test_t {
   int_type num;
   int_type flag;
};

using vector = split::SplitVector<test_t>;
using customAllocatorVector = split::SplitVector<test_t, customAllocator<test_t>>;

// Number of flagged (even-valued) elements produced by the latest fill_vec().
size_t count = 0;

/**
 * @brief Fill v with targetSize sequential elements and record the even count.
 *
 * Elements receive num = 1, 2, 3, ... and flag = 1 for even values, 0 for odd.
 * The global `count` is reset and ends up holding the number of flagged (even)
 * elements, which later tests compare against the compacted output sizes.
 *
 * Fix: removed the std::random_device / std::mt19937 / uniform_int_distribution
 * locals — they were constructed but never used (the dist(gen) call was
 * commented out), so every invocation paid for entropy-source setup for nothing.
 *
 * @param v          Vector to (re)fill; cleared first.
 * @param targetSize Number of elements to produce.
 */
void fill_vec(vector& v, size_t targetSize){
    count = 0;
    v.clear();
    int_type val = 0;
    while (v.size() < targetSize) {
        ++val;
        v.push_back(test_t{val, (val % 2 == 0)});
        if (val % 2 == 0) {
            count++;
        }
    }
}

/// Return true when every element of v carries exactly the given flag value.
bool checkFlags(const customAllocatorVector& v, const int_type target) {
    bool allMatch = true;
    for (const auto& element : v) {
        if (element.flag != target) {
            allMatch = false;
            break;
        }
    }
    return allMatch;
}

bool run_test_small_loop_variant(size_t size){
// std::cout<<"Testing with vector size: "<<size<<std::endl;
vector* v=new vector();
fill_vec(*v,size);

auto predicate_on =[]__host__ __device__ (test_t element)->bool{ return element.flag == 1 ;};
auto predicate_off =[]__host__ __device__ (test_t element)->bool{ return element.flag == 0 ;};
customAllocatorVector* output1=new customAllocatorVector(nextPow2(2*v->size()));
customAllocatorVector* output2=new customAllocatorVector(nextPow2(2*v->size()));

split::tools::copy_if_loop(*v,*output1,predicate_on);
split::tools::copy_if_loop(*v,*output2,predicate_off);
SPLIT_CHECK_ERR( split_gpuDeviceSynchronize() );

bool sane1 = checkFlags(*output1,1);
bool sane2 = checkFlags(*output2,0);
bool sane3 = ((output1->size()+output2->size())==v->size());
bool sane4 =( output1->size() ==count );
bool sane5 = ( output2->size() ==v->size()-count );
// printf( " %d - %d - %d - %d - %d\n",sane1,sane2,sane3,sane4,sane5 );
bool retval = sane1 && sane2 && sane3 && sane4 && sane5;
return retval;
}

// Sweep the loop-variant compaction over small vector sizes (32..1023) and
// require every split to validate.
// NOTE(review): suite name "StremCompaction" looks like a typo for
// "StreamCompaction" — renaming would change --gtest_filter matches, so it is
// flagged here rather than changed.
TEST(StremCompaction , Compaction_Tests_Linear_Loop_Variant){
for (size_t s=32; s< 1024; s++ ){
bool a = run_test_small_loop_variant(s);
expect_true(a);
}

}

// Standard GoogleTest entry point: initialize the framework with the
// command-line arguments, then run every registered test.
int main(int argc, char* argv[]){
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}