Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reductions/19.x #61

Open
wants to merge 14 commits into
base: dev/19.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr {
let SimpleHandler = 1;
}

// +===== kitsune-/tapir-centric attributes

// Declaration attribute spelled `kitsune_reduction` (Clang<> spelling), accepted
// on function-like declarations. Elsewhere in this change, CodeGen checks for
// KitsuneReductionAttr on a call target and adds the
// llvm::Attribute::KitsuneReduction function attribute.
def KitsuneReduction : InheritableAttr {
let Spellings = [Clang<"kitsune_reduction">];
let Subjects = SubjectList<[FunctionLike]>;
// NOTE(review): this reuses StrandMallocDocs, which by its name documents a
// different attribute -- confirm, and give kitsune_reduction its own
// documentation record (or mark it Undocumented) if so.
let Documentation = [StrandMallocDocs];
}

def TapirTarget : StmtAttr {
let Spellings = [CXX11<"tapir","target">];

Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CGCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
}
if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
}
if (TargetDecl->hasAttr<RestrictAttr>())
RetAttrs.addAttribute(llvm::Attribute::NoAlias);
if (TargetDecl->hasAttr<ReturnsNonNullAttr>() &&
Expand Down
145 changes: 145 additions & 0 deletions clang/lib/Headers/kitsune.h.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@

/*
* Copyright (c) 2020 Triad National Security, LLC
* All rights reserved.
*
* This file is part of the kitsune/llvm project. It is released under
* the LLVM license.
*/
#ifndef __CLANG_KITSUNE_H__
#define __CLANG_KITSUNE_H__

#include <stdint.h>
#include <stddef.h>

#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET
#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET

#include "kitsune_rt.h"

/*
 * `reduction` is the user-facing spelling for marking a function as a
 * kitsune reduction: it expands to the noinline and kitsune_reduction
 * attributes. If some earlier header already defined `reduction`, that
 * definition is left alone and a warning is emitted instead.
 */
#if defined(reduction)
#warning found reduction definition: try putting kitsune lower in include order
#else
#define reduction __attribute__((noinline, kitsune_reduction))
#endif


/*
 * OpenCL memory-map marker.
 *
 * KITSUNE_ENABLE_OPENCL_ABI_TARGET comes from `#cmakedefine01`, which always
 * defines the macro (to 0 or 1). Testing it with `defined()` would therefore
 * always succeed, so the value itself is tested here.
 */
#if KITSUNE_ENABLE_OPENCL_ABI_TARGET
/* The argument is parenthesized so expressions such as `p + 1` cast as a whole. */
#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)(a), n)
#ifdef __cplusplus
extern "C" {
#endif
void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n);
#ifdef __cplusplus
}
#endif
#endif

/*
 * Short spellings for the kitsune keywords. Each one is only introduced when
 * the name is not already a macro; otherwise the existing definition wins and
 * a warning reports the clash.
 */
#ifndef spawn
#define spawn _kitsune_spawn
#else
#warning encountered multiple definitions of spawn!
#endif

#ifndef sync
#define sync _kitsune_sync
#else
#warning encountered multiple definitions of sync!
#endif

#ifndef forall
#define forall _kitsune_forall
#else
#warning encountered multiple definitions of forall!
#endif


/*
 * alloc / dealloc: thin wrappers over the runtime's allocation entry points.
 *
 * The entry points are chosen by target macro:
 *   _tapir_cuda_target -> __kitrt_cuMemAllocManaged  / __kitrt_cuMemFree
 *   _tapir_hip_target  -> __kitrt_hipMemAllocManaged / __kitrt_hipMemFree
 *   otherwise          -> __kitrt_defaultMemAlloc    / __kitrt_defaultMemFree
 *
 * The wrappers are identical apart from those names, so the names are
 * selected once and the wrappers are written a single time per language.
 */
#if defined(_tapir_cuda_target)
#define __KITSUNE_MEM_ALLOC_FN __kitrt_cuMemAllocManaged
#define __KITSUNE_MEM_FREE_FN  __kitrt_cuMemFree
#elif defined(_tapir_hip_target)
#define __KITSUNE_MEM_ALLOC_FN __kitrt_hipMemAllocManaged
#define __KITSUNE_MEM_FREE_FN  __kitrt_hipMemFree
#else
#define __KITSUNE_MEM_ALLOC_FN __kitrt_defaultMemAlloc
#define __KITSUNE_MEM_FREE_FN  __kitrt_defaultMemFree
#endif // target selection

#ifdef __cplusplus
extern "C" __attribute__((malloc)) void* __KITSUNE_MEM_ALLOC_FN(size_t);
extern "C" void __KITSUNE_MEM_FREE_FN(void*);

/*
 * Allocate storage for N objects of type T. The memory is returned
 * uninitialized (no constructors run). Returns a null pointer when
 * sizeof(T) * N does not fit in size_t, instead of silently requesting a
 * wrapped-around (too small) byte count.
 */
template <typename T>
inline __attribute__((always_inline))
T* alloc(size_t N) {
  if (N > ((size_t)-1) / sizeof(T))
    return nullptr;
  return (T*)__KITSUNE_MEM_ALLOC_FN(sizeof(T) * N);
}

/* Release storage previously obtained from alloc<T>(). */
template <typename T>
void dealloc(T* array) {
  __KITSUNE_MEM_FREE_FN((void*)array);
}
#else
void* __attribute__((malloc)) __KITSUNE_MEM_ALLOC_FN(size_t);
void __KITSUNE_MEM_FREE_FN(void*);

/* Allocate total_bytes bytes through the selected runtime entry point. */
inline __attribute__((always_inline))
void *alloc(size_t total_bytes) {
  return __KITSUNE_MEM_ALLOC_FN(total_bytes);
}

/* Release storage previously obtained from alloc(). */
inline __attribute__((always_inline))
void dealloc(void *array) {
  __KITSUNE_MEM_FREE_FN(array);
}
#endif // __cplusplus

/* The selector macros are private to this header. */
#undef __KITSUNE_MEM_ALLOC_FN
#undef __KITSUNE_MEM_FREE_FN

#endif
43 changes: 43 additions & 0 deletions clang/lib/Headers/magma.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#include<string>
#include<limits>

/// A magma over `a`: a type providing one binary operation `op`.
/// Nothing here requires `op` to be associative or commutative.
template <typename a>
struct Magma {
  /// Implementations are used through this polymorphic interface, so
  /// destroying one through a base pointer must be well defined.
  virtual ~Magma() = default;

  /// Combine two values.
  virtual a op(a x, a y) = 0;
};

/// A magma that additionally names an identity element `id()`, used as the
/// starting accumulator of a reduction.
template <typename a>
struct UnitalMagma : public Magma<a> {
  /// The element intended to satisfy op(id(), x) == x.
  virtual a id() = 0;
};

// Example unital magmas

/// Addition, with 0 as identity.
template <typename a>
struct Sum : UnitalMagma<a> {
  a op(a x, a y) override { return x + y; }
  // Requires `a` to be constructible from the literal 0.
  a id() override { return 0; }
};

/// Multiplication, with 1 as identity.
template <typename a>
struct Product : UnitalMagma<a> {
  a op(a x, a y) override { return x * y; }
  a id() override { return 1; }
};

/// String concatenation, with the empty string as identity.
struct StringApp : UnitalMagma<std::string> {
  std::string op(std::string x, std::string y) override { return x.append(y); }
  std::string id() override { return ""; }
};

/// Maximum of two values.
template <typename a>
struct Max : UnitalMagma<a> {
  a op(a x, a y) override { return x > y ? x : y; }
  // lowest(), not min(): for floating-point types min() is the smallest
  // *positive* value, which is not an identity for max over negative inputs.
  // For integer types lowest() == min().
  a id() override { return std::numeric_limits<a>::lowest(); }
};

/// Minimum of two values, with the largest finite value as identity.
template <typename a>
struct Min : UnitalMagma<a> {
  a op(a x, a y) override { return x < y ? x : y; }
  a id() override { return std::numeric_limits<a>::max(); }
};

56 changes: 56 additions & 0 deletions clang/lib/Headers/reductions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include"magma.h"
#include<assert.h>
#include<stdint.h>
#include<stdio.h>
#include<unistd.h>
#include<vector>
//#include<kitsune.h>

/// Sequential left fold of `xs` with the unital magma `m`.
///
/// Starts from m.id() and combines the elements in iteration order:
/// op(...op(op(id, x0), x1)..., xn). Returns m.id() for an empty range.
///
/// `a` cannot be deduced and must be given explicitly, e.g.
/// reduce<int>(Sum<int>(), values).
template <typename a, typename um, typename v>
a reduce(um m, v& xs) {
  auto acc = m.id();
  // Bind by const reference: the element is copied once when it is passed
  // to op, there is no need to copy it a second time here.
  for (const auto& x : xs) {
    acc = m.op(acc, x);
  }
  return acc;
}

/// Chunked reduction of `xs` with the unital magma `m`.
///
/// The input is cut into `nthreads` contiguous chunks. Each chunk is folded
/// into its own accumulator, and the per-chunk results are then folded in
/// chunk order. The accumulators are spaced a cache line apart in one array.
/// The chunk loop below is an ordinary sequential loop.
///
/// `xs` must provide begin()/end() whose difference is the element count,
/// and operator[].
///
/// - `nthreads == 0` is treated as a single chunk.
/// - The element count need not be a multiple of `nthreads`: the first
///   `size % nthreads` chunks take one extra element. When it is a multiple,
///   the chunks are exactly [i*size/nthreads, (i+1)*size/nthreads).
/// - `a` must be given explicitly, e.g. parReduce<int>(Sum<int>(), xs, 4).
template <typename a, typename um, typename v>
a parReduce(um m, v& xs, uint64_t nthreads) {
  // sysconf may report -1 or 0 when the line size is unknown, and the query
  // is not available on every platform; fall back to 64 bytes in those cases.
  long reported = -1;
#ifdef _SC_LEVEL1_DCACHE_LINESIZE
  reported = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
#endif
  const uint64_t linesize = reported > 0 ? static_cast<uint64_t>(reported) : 64;

  // Distance, in elements, between consecutive accumulators. Never 0, so
  // distinct chunks can never share (or overrun) a slot.
  const uint64_t stride = linesize >= sizeof(a) ? linesize / sizeof(a) : 1;

  const uint64_t nchunks = nthreads == 0 ? 1 : nthreads;
  const uint64_t size = static_cast<uint64_t>(xs.end() - xs.begin());
  const uint64_t base = size / nchunks;
  const uint64_t extra = size % nchunks;

  // Owned by a vector so the storage is released even if op/id throws.
  std::vector<a> accs(nchunks * stride);

  uint64_t first = 0;
  for (uint64_t i = 0; i < nchunks; i++) {
    const uint64_t last = first + base + (i < extra ? 1 : 0);
    accs[i * stride] = m.id();
    for (uint64_t j = first; j < last; j++) {
      accs[i * stride] = m.op(accs[i * stride], xs[j]);
    }
    first = last;
  }

  a acc = m.id();
  for (uint64_t i = 0; i < nchunks; i++) {
    acc = m.op(acc, accs[i * stride]);
  }
  return acc;
}

/// Divide-and-conquer reduction of xs[start, end) with the unital magma `m`.
///
/// Ranges shorter than `gs` elements (and any range of at most one element)
/// are folded sequentially starting from m.id(). Longer ranges are split at
/// the midpoint and the two halves are combined with m.op, left half first.
/// An empty or inverted range (end <= start) yields m.id().
///
/// `a` must be given explicitly, e.g. treeReduce<int>(Sum<int>(), xs, 0, n, 64).
template <typename a, typename um, typename v>
a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs) {
  const uint64_t len = end > start ? end - start : 0;

  // `len <= 1` guarantees termination: splitting a one-element range would
  // reproduce the same range on one side, so gs <= 1 must not recurse on it.
  if (len < gs || len <= 1) {
    a acc = m.id();
    for (uint64_t i = start; i < end; i++) {
      acc = m.op(acc, xs[i]);
    }
    return acc;
  }

  // start + len/2 equals (start + end)/2 but cannot overflow.
  const uint64_t mid = start + len / 2;

  // `a` is not deducible from the arguments, so it is passed explicitly.
  // The halves are evaluated into named values to fix the left-to-right order.
  a left = treeReduce<a>(m, xs, start, mid, gs);
  a right = treeReduce<a>(m, xs, mid, end, gs);
  return m.op(left, right);
}



4 changes: 3 additions & 1 deletion clang/lib/Sema/SemaDeclAttr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7120,10 +7120,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_TypeNullable:
handleNullableTypeAttr(S, D, AL);
break;

case ParsedAttr::AT_VTablePointerAuthentication:
handleVTablePointerAuthentication(S, D, AL);
break;
case ParsedAttr::AT_KitsuneReduction:
handleSimpleAttribute<KitsuneReductionAttr>(S, D, AL);
break;
}
}

Expand Down
53 changes: 53 additions & 0 deletions kitsune-tests/reductions/l2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include<time.h>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<kitsune.h>
#include<omp.h>
#include<gpu.h>

/* Reduction operator: folds the value b into the accumulator that a points to. */
reduction
void sum(double *a, double b){
  *a = *a + b;
}

/*
 * L2 (Euclidean) norm of the n-element array a: sqrt(a[0]^2 + ... + a[n-1]^2).
 * The squares are accumulated through the `sum` reduction inside a forall loop.
 * Kept out of line (noinline) so each call in the timing loop is a real call.
 */
__attribute__((noinline))
double l2(uint64_t n, double* a){
  double red = 0;
  forall(uint64_t i=0; i<n; i++){
    /* The original passed an undeclared `x`; the norm needs the square of the element. */
    sum(&red, a[i] * a[i]);
  }

  return sqrt(red);
}

/*
 * Benchmark driver.
 *
 *   argv[1]  e      log2 of the element count (default 28)
 *   argv[2]  niter  number of timed l2() calls (default 100)
 *
 * Fills an array of 2^e doubles with arr[i] = i, prints one l2() result, then
 * times niter calls of l2() and prints the achieved bandwidth in GB/s.
 */
int main(int argc, char** argv){
  int e = argc > 1 ? atoi(argv[1]) : 28;
  int niter = argc > 2 ? atoi(argv[2]) : 100;

  /*
   * e must keep both 1ULL<<e and 2^e * sizeof(double) within 64 bits, and at
   * least one iteration is needed for the timing to mean anything. atoi
   * returns 0 for non-numeric text, so a bad niter is rejected here as well.
   */
  if (e < 0 || e > 60 || niter < 1) {
    fprintf(stderr, "usage: l2 [log2(n) in 0..60] [niter >= 1]\n");
    return EXIT_FAILURE;
  }

  uint64_t n = 1ULL<<e;
  double* arr = (double*)gpuManagedMalloc(sizeof(double) * n);
  if (arr == NULL) {
    fprintf(stderr, "failed to allocate %llu doubles\n", (unsigned long long)n);
    return EXIT_FAILURE;
  }

  forall(uint64_t i=0; i<n; i++){
    arr[i] = (double)i;
  }

  printf("result: %f \n", l2(n, arr));

  /*
   * Each result is stored through a volatile so it counts as used. This
   * replaces a stack VLA of niter doubles that was written but never read.
   */
  volatile double sink = 0.0;
  double before = omp_get_wtime();
  for(int i=0; i<niter; i++){
    sink = l2(n, arr);
  }
  double after = omp_get_wtime();
  (void)sink;

  double partime = after - before;
  /* Byte count computed in double so large e * niter cannot wrap an integer. */
  double bytes = (double)n * (double)niter * (double)sizeof(double);
  double bw = bytes / (1000000000.0 * partime);
  printf("bandwidth: %f GB/s \n" , bw);

  /* NOTE(review): arr is never released; the matching free routine for
   * gpuManagedMalloc is not visible here -- confirm and call it. */
  return 0;
}

Loading