CUDA Stream support (#213)

wsmoses · web-flow · commit 2b123e0fc413 · 2022-05-19T01:39:40.000+02:00
* CUDA Stream support

* Async lowering [WIP]

* Fix lowering to moccuda

* Convert to malloc/free

* Fix non-async

* Update LLVM

* Fix build
diff --git a/include/polygeist/PolygeistOps.td b/include/polygeist/PolygeistOps.td
@@ -65,6 +65,16 @@ def SubIndexOp : Polygeist_Op<"subindex", [
   }];
 }
 
+
+def StreamToTokenOp : Polygeist_Op<"stream2token", [
+  NoSideEffect
+]> {
+  let summary = "Extract an async token from a CUDA stream";
+
+  let arguments = (ins AnyType : $source); // the stream value being converted
+  let results = (outs AnyType : $result);  // the token; its type is chosen by whoever builds the op
+}
+
 //===----------------------------------------------------------------------===//
 // Memref2PointerOp
 //===----------------------------------------------------------------------===//
diff --git a/lib/polygeist/Passes/CMakeLists.txt b/lib/polygeist/Passes/CMakeLists.txt
@@ -23,6 +23,7 @@ add_mlir_dialect_library(MLIRPolygeistTransforms
 
   LINK_LIBS PUBLIC
   MLIRAffine
+  MLIRAsync
   MLIRAffineUtils
   MLIRFunc
   MLIRFuncTransforms
diff --git a/lib/polygeist/Passes/ConvertPolygeistToLLVM.cpp b/lib/polygeist/Passes/ConvertPolygeistToLLVM.cpp
diff --git a/lib/polygeist/Passes/ParallelLower.cpp b/lib/polygeist/Passes/ParallelLower.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Analysis/CallGraph.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
@@ -274,6 +275,21 @@ void ParallelLower::runOnOperation() {
 
     auto oneindex = builder.create<ConstantIndexOp>(loc, 1);
 
+    async::ExecuteOp asyncOp = nullptr;
+    if (!llvm::empty(launchOp.asyncDependencies())) {
+      SmallVector<Value> dependencies;
+      for (auto v : launchOp.asyncDependencies()) {
+        auto tok = v.getDefiningOp<polygeist::StreamToTokenOp>();
+        dependencies.push_back(builder.create<polygeist::StreamToTokenOp>(
+            tok.getLoc(), builder.getType<async::TokenType>(), tok.source()));
+      }
+      asyncOp = builder.create<mlir::async::ExecuteOp>(
+          loc, /*results*/ TypeRange(), /*dependencies*/ dependencies,
+          /*operands*/ ValueRange());
+      Block *blockB = &asyncOp.body().front();
+      builder.setInsertionPointToStart(blockB);
+    }
+
     auto block = builder.create<mlir::scf::ParallelOp>(
         loc, std::vector<Value>({zindex, zindex, zindex}),
         std::vector<Value>(
diff --git a/llvm-project b/llvm-project
@@ -1 +1 @@
-Subproject commit 08ac66124874d70dab63c731da0244f9e29ef168
+Subproject commit 00a12585933ef63ff1204bf5cd265f0071d04642
diff --git a/tools/mlir-clang/Lib/CGCall.cc b/tools/mlir-clang/Lib/CGCall.cc
@@ -296,9 +296,22 @@ ValueCategory MLIRScanner::CallHelper(
                     val, idx)));
       }
     }
-    auto op = builder.create<mlir::gpu::LaunchOp>(loc, blocks[0], blocks[1],
-                                                  blocks[2], threads[0],
-                                                  threads[1], threads[2]);
+    mlir::Value stream = nullptr;
+    SmallVector<mlir::Value, 1> asyncDependencies;
+    if (3 < CU->getConfig()->getNumArgs() &&
+        !isa<CXXDefaultArgExpr>(CU->getConfig()->getArg(3))) {
+      stream = Visit(CU->getConfig()->getArg(3)).getValue(builder);
+      stream = builder.create<polygeist::StreamToTokenOp>(
+          loc, builder.getType<gpu::AsyncTokenType>(), stream);
+      assert(stream);
+      asyncDependencies.push_back(stream);
+    }
+    auto op = builder.create<mlir::gpu::LaunchOp>(
+        loc, blocks[0], blocks[1], blocks[2], threads[0], threads[1],
+        threads[2],
+        /*dynamic shmem size*/ nullptr,
+        /*token type*/ stream ? stream.getType() : nullptr,
+        /*dependencies*/ asyncDependencies);
     auto oldpoint = builder.getInsertionPoint();
     auto *oldblock = builder.getInsertionBlock();
     builder.setInsertionPointToStart(&op.getRegion().front());
diff --git a/tools/mlir-clang/Lib/clang-mlir.cc b/tools/mlir-clang/Lib/clang-mlir.cc
@@ -718,8 +718,7 @@ mlir::Attribute MLIRScanner::InitializeValueByInitListExpr(mlir::Value toInit,
         return mlir::DenseElementsAttr();
       if (auto mt = toInit.getType().dyn_cast<MemRefType>()) {
         return DenseElementsAttr::getFromRawBuffer(
-            RankedTensorType::get(mt.getShape(), mt.getElementType()), attrs,
-            false);
+            RankedTensorType::get(mt.getShape(), mt.getElementType()), attrs);
       }
       return mlir::DenseElementsAttr();
     } else {
diff --git a/tools/mlir-clang/Test/Verification/stream.cu b/tools/mlir-clang/Test/Verification/stream.cu
@@ -0,0 +1,26 @@
+// RUN: mlir-clang %s --cuda-gpu-arch=sm_60 -nocudalib -nocudainc %resourcedir --function=* -S | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+__device__ void something(int* array, int n);
+
+// Kernel launched by run(): forwards both arguments to the externally defined device function.
+__global__ void square(int *array, int n) {
+	something(array, n);
+}
+
+void run(cudaStream_t stream1, int *array, int n) {
+    square<<< 10, 20, 0, stream1>>> (array, n) ; // launch config: 10 blocks, 20 threads, 0 bytes dynamic shared memory, on stream1
+}
+
+// CHECK:   func.func @_Z3runP10cudaStreamPii(%arg0: !llvm.ptr<struct<()>>, %arg1: memref<?xi32>, %arg2: i32) attributes {llvm.linkage = #llvm.linkage<external>} {
+// CHECK-NEXT:     %c10 = arith.constant 10 : index
+// CHECK-NEXT:     %c1 = arith.constant 1 : index
+// CHECK-NEXT:     %c20 = arith.constant 20 : index
+// CHECK-NEXT:     %0 = "polygeist.stream2token"(%arg0) : (!llvm.ptr<struct<()>>) -> !gpu.async.token
+// CHECK-NEXT:     %1 = gpu.launch async [%0] blocks(%arg3, %arg4, %arg5) in (%arg9 = %c10, %arg10 = %c1, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c20, %arg13 = %c1, %arg14 = %c1) {
+// CHECK-NEXT:       func.call @_Z21__device_stub__squarePii(%arg1, %arg2) : (memref<?xi32>, i32) -> ()
+// CHECK-NEXT:       gpu.terminator
+// CHECK-NEXT:     }
+// CHECK-NEXT:     return
+// CHECK-NEXT:   }
diff --git a/tools/mlir-clang/Test/Verification/whiletofor.c b/tools/mlir-clang/Test/Verification/whiletofor.c
@@ -22,14 +22,14 @@ void whiletofor() {
 
 // TODO redundant for elim
 // CHECK: func @whiletofor()
-// CHECK-NEXT:     %c7_i32 = arith.constant 7 : i32
-// CHECK-NEXT:     %c0_i32 = arith.constant 0 : i32
-// CHECK-NEXT:     %c20_i32 = arith.constant 20 : i32
-// CHECK-NEXT:     %c2_i32 = arith.constant 2 : i32
-// CHECK-NEXT:     %c3_i32 = arith.constant 3 : i32
-// CHECK-NEXT:     %c1 = arith.constant 1 : index
-// CHECK-NEXT:     %c0 = arith.constant 0 : index
-// CHECK-NEXT:     %c100 = arith.constant 100 : index
+// CHECK-DAG:     %c7_i32 = arith.constant 7 : i32
+// CHECK-DAG:     %c0_i32 = arith.constant 0 : i32
+// CHECK-DAG:     %c20_i32 = arith.constant 20 : i32
+// CHECK-DAG:     %c2_i32 = arith.constant 2 : i32
+// CHECK-DAG:     %c3_i32 = arith.constant 3 : i32
+// CHECK-DAG:     %c1 = arith.constant 1 : index
+// CHECK-DAG:     %c0 = arith.constant 0 : index
+// CHECK-DAG:     %c100 = arith.constant 100 : index
 // CHECK-NEXT:     %0 = memref.alloca() : memref<100x100xi32>
 // CHECK-NEXT:     %1 = scf.for %arg0 = %c0 to %c100 step %c1 iter_args(%arg1 = %c7_i32) -> (i32) {
 // CHECK-NEXT:       %3 = arith.index_cast %arg1 : i32 to index
diff --git a/tools/mlir-clang/Test/canonicalization.c b/tools/mlir-clang/Test/canonicalization.c
@@ -5,10 +5,10 @@
 
 // CHECK-LABEL:   func @matrix_power(
 // CHECK:                       %[[VAL_0:.*]]: memref<20x20xi32>, %[[VAL_1:.*]]: memref<20xi32>, %[[VAL_2:.*]]: memref<20xi32>, %[[VAL_3:.*]]: memref<20xi32>)
-// CHECK-NEXT:     %c1 = arith.constant 1 : index
-// CHECK-NEXT:     %c20 = arith.constant 20 : index
-// CHECK-NEXT:     %c0 = arith.constant 0 : index
-// CHECK-NEXT:     %c-1_i32 = arith.constant -1 : i32
+// CHECK-DAG:     %c1 = arith.constant 1 : index
+// CHECK-DAG:     %c20 = arith.constant 20 : index
+// CHECK-DAG:     %c0 = arith.constant 0 : index
+// CHECK-DAG:     %c-1_i32 = arith.constant -1 : i32
 // CHECK-NEXT:     scf.for %arg4 = %c1 to %c20 step %c1 {
 // CHECK-NEXT:       %0 = arith.index_cast %arg4 : index to i32
 // CHECK-NEXT:       %1 = arith.addi %0, %c-1_i32 : i32
diff --git a/tools/mlir-clang/mlir-clang.cc b/tools/mlir-clang/mlir-clang.cc
@@ -26,6 +26,7 @@
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/SCFToOpenMP/SCFToOpenMP.h"
 #include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -418,6 +419,7 @@ int main(int argc, char **argv) {
   context.getOrLoadDialect<func::FuncDialect>();
   context.getOrLoadDialect<DLTIDialect>();
   context.getOrLoadDialect<mlir::scf::SCFDialect>();
+  context.getOrLoadDialect<mlir::async::AsyncDialect>();
   context.getOrLoadDialect<mlir::LLVM::LLVMDialect>();
   context.getOrLoadDialect<mlir::NVVM::NVVMDialect>();
   context.getOrLoadDialect<mlir::gpu::GPUDialect>();
diff --git a/tools/polygeist-opt/polygeist-opt.cpp b/tools/polygeist-opt/polygeist-opt.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Async/IR/Async.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
@@ -48,6 +49,7 @@ int main(int argc, char **argv) {
   registry.insert<mlir::AffineDialect>();
   registry.insert<mlir::LLVM::LLVMDialect>();
   registry.insert<mlir::memref::MemRefDialect>();
+  registry.insert<mlir::async::AsyncDialect>();
   registry.insert<mlir::func::FuncDialect>();
   registry.insert<mlir::arith::ArithmeticDialect>();
   registry.insert<mlir::scf::SCFDialect>();