lanl · stelleg · Dec 10, 2021 · Dec 10, 2021 · Dec 10, 2021 · Jun 28, 2022
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr {
   let SimpleHandler = 1;
 }
 
+// +===== kitsune-/tapir-centric attributes
+
+def KitsuneReduction : InheritableAttr {  // `kitsune_reduction`, for function-like decls
+  let Spellings = [Clang<"kitsune_reduction">];
+  let Subjects = SubjectList<[FunctionLike]>;
+  let Documentation = [Undocumented];  // was StrandMallocDocs, which documents another attribute
+}
+
 def TapirTarget : StmtAttr {
   let Spellings = [CXX11<"tapir","target">];
 

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
@@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
       FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     }
+    if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
+      FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
+    }
     if (TargetDecl->hasAttr<RestrictAttr>())
       RetAttrs.addAttribute(llvm::Attribute::NoAlias);
     if (TargetDecl->hasAttr<ReturnsNonNullAttr>() &&

diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake
@@ -0,0 +1,145 @@
+
+/*
+ * Copyright (c) 2020 Triad National Security, LLC
+ *                         All rights reserved.
+ *
+ * This file is part of the kitsune/llvm project.  It is released under
+ * the LLVM license.
+ */
+#ifndef __CLANG_KITSUNE_H__
+#define __CLANG_KITSUNE_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET
+
+#include "kitsune_rt.h"
+
+#if defined(reduction)  // leave an existing `reduction` macro alone, but tell the user
+#warning found reduction definition: try putting kitsune lower in include order
+#else  // `reduction` expands to the attributes that mark a kitsune reduction function
+#define reduction __attribute__((noinline, kitsune_reduction))
+#endif
+
+
+#if KITSUNE_ENABLE_OPENCL_ABI_TARGET  // test the value: #cmakedefine01 always defines it (0 or 1)
+#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)(a), (n))
+#ifdef __cplusplus
+extern "C" {  // the marker has C linkage in both C and C++ translation units
+#endif
+  void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n);
+#ifdef __cplusplus
+}
+#endif
+#endif  // KITSUNE_ENABLE_OPENCL_ABI_TARGET
+
+#ifndef spawn  // short alias for _kitsune_spawn, unless the name is already a macro
+#define spawn _kitsune_spawn
+#else
+#warning encountered multiple definitions of spawn!
+#endif
+
+#ifndef sync  // short alias for _kitsune_sync, same pattern
+#define sync _kitsune_sync
+#else
+#warning encountered multiple definitions of sync!
+#endif
+
+#ifndef forall  // short alias for _kitsune_forall, same pattern
+#define forall _kitsune_forall
+#else
+#warning encountered multiple definitions of forall!
+#endif
+
+
+#if defined(_tapir_cuda_target)
+  #ifdef __cplusplus
+    // alloc<T>(N): storage for N objects of T, obtained from __kitrt_cuMemAllocManaged.
+    extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t);
+    template <typename T> inline __attribute__((always_inline))
+    T* alloc(size_t N) {
+      return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N);
+    }
+    // dealloc(p): hands p to __kitrt_cuMemFree.
+    extern "C" void __kitrt_cuMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_cuMemFree((void*)array);
+    }
+  #else
+    // C variant: alloc() takes a byte count. Both helpers are static inline because
+    // a plain C99 `inline` definition provides no out-of-line symbol to link against.
+    void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t);
+    static inline __attribute__((always_inline)) void *alloc(size_t total_bytes) {
+      return __kitrt_cuMemAllocManaged(total_bytes);
+    }
+
+    void __kitrt_cuMemFree(void*);
+    static inline __attribute__((always_inline)) void dealloc(void *array) {
+      __kitrt_cuMemFree(array);
+    }
+  #endif
+#elif defined(_tapir_hip_target)
+  #ifdef __cplusplus
+    // alloc<T>(N): storage for N objects of T, obtained from __kitrt_hipMemAllocManaged.
+    extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t);
+    template <typename T> inline __attribute__((always_inline))
+    T* alloc(size_t N) {
+      return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N);
+    }
+    // dealloc(p): hands p to __kitrt_hipMemFree.
+    extern "C" void __kitrt_hipMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_hipMemFree((void*)array);
+    }
+  #else
+    // C variant: alloc() takes a byte count. Both helpers are static inline because
+    // a plain C99 `inline` definition provides no out-of-line symbol to link against.
+    void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t);
+    static inline __attribute__((always_inline)) void *alloc(size_t total_bytes) {
+      return __kitrt_hipMemAllocManaged(total_bytes);
+    }
+
+    void __kitrt_hipMemFree(void*);
+    static inline __attribute__((always_inline)) void dealloc(void *array) {
+      __kitrt_hipMemFree(array);
+    }
+  #endif
+#else
+  #ifdef __cplusplus
+    // alloc<T>(N): storage for N objects of T, obtained from __kitrt_defaultMemAlloc.
+    extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t);
+    template <typename T> inline __attribute__((always_inline))
+    T* alloc(size_t N) {
+      return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N);
+    }
+    // dealloc(p): hands p to __kitrt_defaultMemFree; cast as in the CUDA/HIP variants.
+    extern "C" void __kitrt_defaultMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_defaultMemFree((void*)array);
+    }
+  #else
+    // C variant: alloc() takes a byte count. Both helpers are static inline because
+    // a plain C99 `inline` definition provides no out-of-line symbol to link against.
+    void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t);
+    static inline __attribute__((always_inline)) void *alloc(size_t total_bytes) {
+      return __kitrt_defaultMemAlloc(total_bytes);
+    }
+
+    void __kitrt_defaultMemFree(void*);
+    static inline __attribute__((always_inline)) void dealloc(void* array) {
+      __kitrt_defaultMemFree(array);
+    }
+  #endif // __cplusplus
+#endif // cpu targets
+
+#endif
diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h
@@ -0,0 +1,43 @@
+#include<string>
+#include<limits>
+
+template <typename a>
+struct Magma {  // interface: a type `a` together with a binary operation `op`
+  virtual ~Magma() = default;  // polymorphic base, so deleting through Magma* is well-defined
+  virtual a op(a x, a y) = 0;
+};
+template <typename a>
+struct UnitalMagma : public Magma<a> {  // a Magma that additionally supplies `id()`
+  virtual a id() = 0;
+};
+
+// Example unital magmas
+template <typename a>
+struct Sum : UnitalMagma<a> {  // op is `+`, id is 0
+  a op(a x, a y) override { a total = x + y; return total; }
+  a id() override { return 0; }  // look into this more
+};
+
+template <typename a>
+struct Product : UnitalMagma<a> {  // op is `*`, id is 1
+  a op(a x, a y) override { a result = x * y; return result; }
+  a id() override { return 1; }
+};
+
+struct StringApp : UnitalMagma<std::string> {  // op appends y to x, id is the empty string
+  std::string op(std::string x, std::string y) override { x += y; return x; }
+  std::string id() override { return std::string(); }
+};
+
+template <typename a>
+struct Max : UnitalMagma<a> {  // op picks the larger argument; id is the least value of `a`
+  a op(a x, a y){ return x > y ? x : y; }
+  a id() { return std::numeric_limits<a>::lowest(); }  // min() is positive for floating point
+};
+
+template <typename a>
+struct Min : UnitalMagma<a> {  // op picks the smaller argument; id is numeric_limits<a>::max()
+  a op(a x, a y) override { if (x < y) return x; return y; }
+  a id() override { return std::numeric_limits<a>::max(); }
+};
+
diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h
@@ -0,0 +1,56 @@
+#include"magma.h"
+#include<assert.h>
+#include<unistd.h>
+#include<stdio.h>
+//#include<kitsune.h>
+
+template <typename a, typename um, typename v>
+a reduce(um m, v& xs){  // sequential fold of xs with m.op, seeded with m.id()
+  auto total = m.id();
+  for(auto elem : xs){  // visits xs in iteration order, one copy per element
+    total = m.op(total, elem);
+  }
+  return total;
+}
+
+template <typename a, typename um, typename v>
+a parReduce(um m, v& xs, uint64_t nthreads){
+  // Folds xs in nthreads equal chunks, one accumulator per chunk spaced a cache
+  // line apart, then combines the partial results in chunk order.
+  long linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);  // 0 or -1 when unknown
+  assert(linesize <= 0 || linesize % sizeof(a) == 0);
+  uint64_t linenum = linesize >= (long)sizeof(a) ? linesize / sizeof(a) : 1;  // stride is never 0
+  a* accs = new a[nthreads * linenum];
+  uint64_t size = xs.end() - xs.begin();
+  assert(nthreads > 0 && size % nthreads == 0);
+  uint64_t grainsize = size / nthreads;
+  for(uint64_t i=0; i<nthreads; i++){
+    accs[i*linenum] = m.id();
+    for(uint64_t j = i*grainsize; j<(i+1)*grainsize; j++){
+      accs[i*linenum] = m.op(accs[i*linenum], xs[j]);
+    }
+  }
+  a acc = m.id();
+  for(uint64_t i=0; i<nthreads; i++) acc = m.op(acc, accs[i*linenum]);
+  delete[] accs;
+  return acc;
+}
+
+template<typename a, typename um, typename v>
+a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){
+  // Divide-and-conquer reduction of xs[start, end) with m. Ranges shorter than gs,
+  // or too short to split, are folded sequentially so the recursion always ends.
+  if(end-start < gs || end-start < 2){
+    a acc = m.id();
+    for(uint64_t i=start; i<end; i++){
+      acc = m.op(acc, xs[i]);  // combine through the magma, not a hard-coded `+`
+    }
+    return acc;
+  }
+  uint64_t mid = start + (end - start) / 2;
+  // `a` cannot be deduced from the arguments, so it is passed on explicitly.
+  return m.op(treeReduce<a>(m, xs, start, mid, gs), treeReduce<a>(m, xs, mid, end, gs));
+}
+
+
+
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7120,10 +7120,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_TypeNullable:
     handleNullableTypeAttr(S, D, AL);
     break;
-
   case ParsedAttr::AT_VTablePointerAuthentication:
     handleVTablePointerAuthentication(S, D, AL);
     break;
+  case ParsedAttr::AT_KitsuneReduction:
+    handleSimpleAttribute<KitsuneReductionAttr>(S, D, AL);
+    break;
   }
 }
 

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
@@ -0,0 +1,53 @@
+#include<time.h>
+#include<math.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<kitsune.h>
+#include<omp.h>
+#include<gpu.h>
+
+reduction
+void sum(double *a, double b){  // adds b into the accumulator that a points to
+  *a = *a + b;
+}
+
+__attribute__((noinline))
+double l2(uint64_t n, double* a){  // Euclidean (L2) norm of a[0..n)
+  double red = 0;
+  forall(uint64_t i=0; i<n; i++){
+    sum(&red, a[i] * a[i]);  // was the undeclared `x`; the norm accumulates squares
+  }
+  // red is only updated through sum() inside the forall above
+  return sqrt(red);
+}
+
+int main(int argc, char** argv){
+  // usage: l2 [e] [niter] -- array of 2^e doubles (default 28), niter timed runs (default 100)
+  int e = argc > 1 ? atoi(argv[1]) : 28;
+  int niter = argc > 2 ? atoi(argv[2]) : 100;
+  if(e < 0 || e > 60 || niter < 1){  // keeps 1ULL<<e and the byte count below in range
+    fprintf(stderr, "usage: %s [e in 0..60] [niter >= 1]\n", argv[0]);
+    return 1;
+  }
+  uint64_t n = 1ULL<<e;
+  double* arr = (double*)gpuManagedMalloc(sizeof(double) * n);
+  forall(uint64_t i=0; i<n; i++){
+    arr[i] = (double)i;
+  }
+
+  printf("result: %f \n", l2(n, arr));
+
+  volatile double last = 0;  // sink for the timed results; replaces a niter-sized stack VLA
+  double before = omp_get_wtime();
+  for(int i=0; i<niter; i++){
+    last = l2(n, arr);
+  }
+  double after = omp_get_wtime();
+
+  double partime = after - before;  // elapsed time of the timed loop
+  double bw = (double)n * niter * sizeof(double) / (1000000000.0 * partime);
+  printf("bandwidth: %f GB/s \n" , bw);
+  // NOTE(review): arr is never released; the matching free for gpuManagedMalloc is not visible here
+  return 0;
+}
+