Commit f650eae

Global function code gen fix (#208)
* Call device stub from host code for global functions
* Update tests
* Don't reverse inputs
* Add a test case for cuda global code gen
* Add nocuda{inc,lib} options and fix cuda test
* clang format
* Add barebones cuda header for cuda tests
1 parent 2182fa1 commit f650eae

8 files changed: +129 -7 lines
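
For context, a minimal sketch of the host-side pattern this commit establishes, using the names from the new test case below (illustration only; the stub symbol is taken from the test's CHECK lines):

    #include "Inputs/cuda.h" // barebones CUDA header added by this commit

    __global__ void bar(int *a) { *a = 1; } // device kernel

    void baz(int *a) {
      // Host side: mlir-clang now emits a call to the kernel's device stub
      // (@_Z18__device_stub__barPi in the CHECK lines below) inside the
      // generated gpu.launch region, instead of lowering the kernel body
      // a second time on the host path.
      bar<<<dim3(1, 1, 1), dim3(1, 1, 1)>>>(a);
    }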

tools/mlir-clang/Lib/clang-mlir.cc (+21 -2)
@@ -164,6 +164,15 @@ void MLIRScanner::init(mlir::func::FuncOp function, const FunctionDecl *fd) {
     i++;
   }
 
+  if (fd->hasAttr<CUDAGlobalAttr>() && Glob.CGM.getLangOpts().CUDA &&
+      !Glob.CGM.getLangOpts().CUDAIsDevice) {
+    auto deviceStub =
+        Glob.GetOrCreateMLIRFunction(fd, /* getDeviceStub */ true);
+    builder.create<func::CallOp>(loc, deviceStub, function.getArguments());
+    builder.create<ReturnOp>(loc);
+    return;
+  }
+
   if (auto CC = dyn_cast<CXXConstructorDecl>(fd)) {
     const CXXRecordDecl *ClassDecl = CC->getParent();
     for (auto expr : CC->inits()) {
@@ -4366,14 +4375,18 @@ mlir::Value MLIRASTConsumer::GetOrCreateGlobalLLVMString(
 }
 
 mlir::func::FuncOp
-MLIRASTConsumer::GetOrCreateMLIRFunction(const FunctionDecl *FD) {
+MLIRASTConsumer::GetOrCreateMLIRFunction(const FunctionDecl *FD,
+                                         bool getDeviceStub) {
   assert(FD->getTemplatedKind() !=
          FunctionDecl::TemplatedKind::TK_FunctionTemplate);
   assert(
       FD->getTemplatedKind() !=
       FunctionDecl::TemplatedKind::TK_DependentFunctionTemplateSpecialization);
   std::string name;
-  if (auto CC = dyn_cast<CXXConstructorDecl>(FD))
+  if (getDeviceStub)
+    name =
+        CGM.getMangledName(GlobalDecl(FD, KernelReferenceKind::Kernel)).str();
+  else if (auto CC = dyn_cast<CXXConstructorDecl>(FD))
     name = CGM.getMangledName(GlobalDecl(CC, CXXCtorType::Ctor_Complete)).str();
   else if (auto CC = dyn_cast<CXXDestructorDecl>(FD))
     name = CGM.getMangledName(GlobalDecl(CC, CXXDtorType::Dtor_Complete)).str();
@@ -5279,6 +5292,12 @@ static bool parseMLIR(const char *Argv0, std::vector<std::string> filenames,
   if (Verbose) {
     Argv.push_back("-v");
   }
+  if (NoCUDAInc) {
+    Argv.push_back("-nocudainc");
+  }
+  if (NoCUDALib) {
+    Argv.push_back("-nocudalib");
+  }
   if (CUDAGPUArch != "") {
     auto a = "--cuda-gpu-arch=" + CUDAGPUArch;
     char *chars = (char *)malloc(a.length() + 1);

tools/mlir-clang/Lib/clang-mlir.h (+2 -1)
@@ -100,7 +100,8 @@ struct MLIRASTConsumer : public ASTConsumer {
 
   ~MLIRASTConsumer() {}
 
-  mlir::func::FuncOp GetOrCreateMLIRFunction(const FunctionDecl *FD);
+  mlir::func::FuncOp GetOrCreateMLIRFunction(const FunctionDecl *FD,
+                                             bool getDeviceStub = false);
 
   mlir::LLVM::LLVMFuncOp GetOrCreateLLVMFunction(const FunctionDecl *FD);
   mlir::LLVM::LLVMFuncOp GetOrCreateMallocFunction();

tools/mlir-clang/Test/CMakeLists.txt (+1 -2)
@@ -1,6 +1,5 @@
 set(MLIR_CLANG_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(MLIR_CLANG_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(CLANG_HEADER_DIR ${LLVM_BUILD_MAIN_SRC_DIR}/../clang/lib/Headers)
 
 configure_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
@@ -28,4 +27,4 @@ add_lit_testsuite(check-mlir-clang-single "Running the clang-to-mlir regression
   ARGS -j 1
 )
 
-set_target_properties(check-mlir-clang PROPERTIES FOLDER "clang-to-mlir tests")
\ No newline at end of file
+set_target_properties(check-mlir-clang PROPERTIES FOLDER "clang-to-mlir tests")

New file: Inputs/cuda.h (barebones CUDA header for tests, +63)
@@ -0,0 +1,63 @@
+/* Minimal declarations for CUDA support. Testing purposes only. */
+
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                          dim3 blockDim, void **args,
+                                          size_t sharedMem,
+                                          hipStream_t stream);
+#endif // HIP_API_PER_THREAD_DEFAULT_STREAM
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
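
Why the header declares the call-configuration entry points (my gloss, not part of the commit): Clang's host-side lowering of a triple-chevron launch expands, roughly, to

    __cudaPushCallConfiguration(gridDim, blockDim, sharedMem, stream); // record launch config
    __device_stub__kernel(args...); // stub forwards to cudaLaunchKernel

so these declarations are the minimum needed for a <<<...>>> launch to compile without the real CUDA SDK headers.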

New file: CUDA global code gen test case (+29)
@@ -0,0 +1,29 @@
+// RUN: mlir-clang %s --cuda-gpu-arch=sm_60 -nocudalib -nocudainc %resourcedir --function=* -S | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+__global__ void bar(int * a)
+{
+#ifdef __CUDA_ARCH__
+  *a = 1;
+#else
+  *a = 2;
+#endif
+}
+
+void baz(int * a){
+  bar<<<dim3(1,1,1), dim3(1,1,1)>>>(a);
+}
+// CHECK: func private @_Z18__device_stub__barPi(%arg0: memref<?xi32>)
+// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32
+// CHECK-NEXT: affine.store %c1_i32, %arg0[0] : memref<?xi32>
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK: func @_Z3bazPi(%arg0: memref<?xi32>) attributes {llvm.linkage = #llvm.linkage<external>} {
+// CHECK-NEXT: %c1 = arith.constant 1 : index
+// CHECK-NEXT: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %c1, %arg8 = %c1, %arg9 = %c1) threads(%arg4, %arg5, %arg6) in (%arg10 = %c1, %arg11 = %c1, %arg12 = %c1) {
+// CHECK-NEXT: call @_Z18__device_stub__barPi(%arg0) : (memref<?xi32>) -> ()
+// CHECK-NEXT: gpu.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT: }

tools/mlir-clang/Test/lit.cfg (+7 -1)
@@ -59,6 +59,12 @@ llvm_config.add_tool_substitutions(tools, tool_dirs)
 tool_dirs = [config.polygeist_tools_dir]
 tools = [ 'mlir-clang' ]
 llvm_config.add_tool_substitutions(tools, tool_dirs)
-config.substitutions.append(('%stdinclude', '-I ' + config.clang_header_dir + " -I " + config.test_source_root + "/polybench/utilities"))
+
+import subprocess
+
+resource_dir = subprocess.check_output([config.llvm_tools_dir + "/clang", "-print-resource-dir"]).decode('utf-8').strip()
+
+config.substitutions.append(('%stdinclude', '-resource-dir=' + resource_dir + " -I " + config.test_source_root + "/polybench/utilities"))
+config.substitutions.append(('%resourcedir', '-resource-dir=' + resource_dir))
 config.substitutions.append(('%polyexec', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_TIME -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET'))
 config.substitutions.append(('%polyverify', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_DUMP_ARRAYS -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET'))

tools/mlir-clang/Test/lit.site.cfg.in (-1)
@@ -6,7 +6,6 @@ config.llvm_tools_dir = path(r"@LLVM_TOOLS_DIR@")
 config.polygeist_tools_dir = path(r"@POLYGEIST_TOOLS_DIR@")
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.mlir_clang_obj_root = "@MLIR_CLANG_BINARY_DIR@"
-config.clang_header_dir = "@CLANG_HEADER_DIR@"
 config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_obj_root = path(r"@LLVM_BINARY_DIR@")

tools/mlir-clang/mlir-clang.cc (+6)
@@ -111,6 +111,12 @@ static cl::opt<std::string> CUDAGPUArch("cuda-gpu-arch", cl::init(""),
 static cl::opt<std::string> CUDAPath("cuda-path", cl::init(""),
                                      cl::desc("CUDA Path"));
 
+static cl::opt<bool> NoCUDAInc("nocudainc", cl::init(false),
+                               cl::desc("Do not include CUDA headers"));
+
+static cl::opt<bool> NoCUDALib("nocudalib", cl::init(false),
+                               cl::desc("Do not link CUDA libdevice"));
+
 static cl::opt<std::string> Output("o", cl::init("-"), cl::desc("Output file"));
 
 static cl::opt<std::string> cfunction("function",
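
A hedged usage sketch of the new flags (the input file name is hypothetical; the options mirror the RUN line of the new test): they let mlir-clang process CUDA sources on machines without a CUDA installation by forwarding -nocudainc/-nocudalib to the clang driver:

    mlir-clang kernel.cu --cuda-gpu-arch=sm_60 -nocudainc -nocudalib \
        -resource-dir="$(clang -print-resource-dir)" --function='*' -S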
