Commit f650eae

Global function code gen fix (#208)
* Call device stub from host code for global functions
* Update tests
* Don't reverse inputs
* Add a test case for cuda global code gen
* Add nocuda{inc,lib} options and fix cuda test
* clang format
* Add barebones cuda header for cuda tests
1 parent 2182fa1 commit f650eae

8 files changed: +129 -7 lines
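
For context, a minimal sketch of the host-side pattern this commit establishes, using the names from the new test case below (illustration only; the stub symbol is taken from the test's CHECK lines):

    #include "Inputs/cuda.h" // barebones CUDA header added by this commit

    __global__ void bar(int *a) { *a = 1; } // device kernel

    void baz(int *a) {
      // Host side: mlir-clang now emits a call to the kernel's device stub
      // (@_Z18__device_stub__barPi in the CHECK lines below) inside the
      // generated gpu.launch region, instead of lowering the kernel body
      // a second time on the host path.
      bar<<<dim3(1, 1, 1), dim3(1, 1, 1)>>>(a);
    }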

tools/mlir-clang/Lib/clang-mlir.cc (+21 -2)
@@ -164,6 +164,15 @@ void MLIRScanner::init(mlir::func::FuncOp function, const FunctionDecl *fd) {
     i++;
   }
 
+  if (fd->hasAttr<CUDAGlobalAttr>() && Glob.CGM.getLangOpts().CUDA &&
+      !Glob.CGM.getLangOpts().CUDAIsDevice) {
+    auto deviceStub =
+        Glob.GetOrCreateMLIRFunction(fd, /* getDeviceStub */ true);
+    builder.create<func::CallOp>(loc, deviceStub, function.getArguments());
+    builder.create<ReturnOp>(loc);
+    return;
+  }
+
   if (auto CC = dyn_cast<CXXConstructorDecl>(fd)) {
     const CXXRecordDecl *ClassDecl = CC->getParent();
     for (auto expr : CC->inits()) {
@@ -4366,14 +4375,18 @@ mlir::Value MLIRASTConsumer::GetOrCreateGlobalLLVMString(
 }
 
 mlir::func::FuncOp
-MLIRASTConsumer::GetOrCreateMLIRFunction(const FunctionDecl *FD) {
+MLIRASTConsumer::GetOrCreateMLIRFunction(const FunctionDecl *FD,
+                                         bool getDeviceStub) {
   assert(FD->getTemplatedKind() !=
          FunctionDecl::TemplatedKind::TK_FunctionTemplate);
   assert(
       FD->getTemplatedKind() !=
       FunctionDecl::TemplatedKind::TK_DependentFunctionTemplateSpecialization);
   std::string name;
-  if (auto CC = dyn_cast<CXXConstructorDecl>(FD))
+  if (getDeviceStub)
+    name =
+        CGM.getMangledName(GlobalDecl(FD, KernelReferenceKind::Kernel)).str();
+  else if (auto CC = dyn_cast<CXXConstructorDecl>(FD))
     name = CGM.getMangledName(GlobalDecl(CC, CXXCtorType::Ctor_Complete)).str();
   else if (auto CC = dyn_cast<CXXDestructorDecl>(FD))
     name = CGM.getMangledName(GlobalDecl(CC, CXXDtorType::Dtor_Complete)).str();
@@ -5279,6 +5292,12 @@ static bool parseMLIR(const char *Argv0, std::vector<std::string> filenames,
   if (Verbose) {
     Argv.push_back("-v");
   }
+  if (NoCUDAInc) {
+    Argv.push_back("-nocudainc");
+  }
+  if (NoCUDALib) {
+    Argv.push_back("-nocudalib");
+  }
   if (CUDAGPUArch != "") {
     auto a = "--cuda-gpu-arch=" + CUDAGPUArch;
     char *chars = (char *)malloc(a.length() + 1);

tools/mlir-clang/Lib/clang-mlir.h (+2 -1)
@@ -100,7 +100,8 @@ struct MLIRASTConsumer : public ASTConsumer {
 
   ~MLIRASTConsumer() {}
 
-  mlir::func::FuncOp GetOrCreateMLIRFunction(const FunctionDecl *FD);
+  mlir::func::FuncOp GetOrCreateMLIRFunction(const FunctionDecl *FD,
+                                             bool getDeviceStub = false);
 
   mlir::LLVM::LLVMFuncOp GetOrCreateLLVMFunction(const FunctionDecl *FD);
   mlir::LLVM::LLVMFuncOp GetOrCreateMallocFunction();

tools/mlir-clang/Test/CMakeLists.txt (+1 -2)
@@ -1,6 +1,5 @@
 set(MLIR_CLANG_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(MLIR_CLANG_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
-set(CLANG_HEADER_DIR ${LLVM_BUILD_MAIN_SRC_DIR}/../clang/lib/Headers)
 
 configure_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
@@ -28,4 +27,4 @@ add_lit_testsuite(check-mlir-clang-single "Running the clang-to-mlir regression
   ARGS -j 1
 )
 
-set_target_properties(check-mlir-clang PROPERTIES FOLDER "clang-to-mlir tests")
\ No newline at end of file
+set_target_properties(check-mlir-clang PROPERTIES FOLDER "clang-to-mlir tests")

New file: Inputs/cuda.h (barebones CUDA header for tests, +63)
@@ -0,0 +1,63 @@
+/* Minimal declarations for CUDA support. Testing purposes only. */
+
+#include <stddef.h>
+
+#if __HIP__ || __CUDA__
+#define __constant__ __attribute__((constant))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __host__ __attribute__((host))
+#define __shared__ __attribute__((shared))
+#if __HIP__
+#define __managed__ __attribute__((managed))
+#endif
+#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#endif
+
+struct dim3 {
+  unsigned x, y, z;
+  __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
+};
+
+#if __HIP__ || HIP_PLATFORM
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+                                          dim3 blockDim, void **args,
+                                          size_t sharedMem,
+                                          hipStream_t stream);
+#endif // HIP_API_PER_THREAD_DEFAULT_STREAM
+#else
+typedef struct cudaStream *cudaStream_t;
+typedef enum cudaError {} cudaError_t;
+extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
+                                 size_t sharedSize = 0,
+                                 cudaStream_t stream = 0);
+extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                           size_t sharedSize = 0,
+                                           cudaStream_t stream = 0);
+extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
+                                        dim3 blockDim, void **args,
+                                        size_t sharedMem, cudaStream_t stream);
+#endif
+
+extern "C" __device__ int printf(const char*, ...);
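
Why the header declares the call-configuration entry points (my gloss, not part of the commit): Clang's host-side lowering of a triple-chevron launch expands, roughly, to

    __cudaPushCallConfiguration(gridDim, blockDim, sharedMem, stream); // record launch config
    __device_stub__kernel(args...); // stub forwards to cudaLaunchKernel

so these declarations are the minimum needed for a <<<...>>> launch to compile without the real CUDA SDK headers.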

New file: CUDA global code gen test case (+29)
@@ -0,0 +1,29 @@
+// RUN: mlir-clang %s --cuda-gpu-arch=sm_60 -nocudalib -nocudainc %resourcedir --function=* -S | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+__global__ void bar(int * a)
+{
+#ifdef __CUDA_ARCH__
+  *a = 1;
+#else
+  *a = 2;
+#endif
+}
+
+void baz(int * a){
+  bar<<<dim3(1,1,1), dim3(1,1,1)>>>(a);
+}
+// CHECK: func private @_Z18__device_stub__barPi(%arg0: memref<?xi32>)
+// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32
+// CHECK-NEXT: affine.store %c1_i32, %arg0[0] : memref<?xi32>
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK: func @_Z3bazPi(%arg0: memref<?xi32>) attributes {llvm.linkage = #llvm.linkage<external>} {
+// CHECK-NEXT: %c1 = arith.constant 1 : index
+// CHECK-NEXT: gpu.launch blocks(%arg1, %arg2, %arg3) in (%arg7 = %c1, %arg8 = %c1, %arg9 = %c1) threads(%arg4, %arg5, %arg6) in (%arg10 = %c1, %arg11 = %c1, %arg12 = %c1) {
+// CHECK-NEXT: call @_Z18__device_stub__barPi(%arg0) : (memref<?xi32>) -> ()
+// CHECK-NEXT: gpu.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT: }

tools/mlir-clang/Test/lit.cfg (+7 -1)
@@ -59,6 +59,12 @@ llvm_config.add_tool_substitutions(tools, tool_dirs)
 tool_dirs = [config.polygeist_tools_dir]
 tools = [ 'mlir-clang' ]
 llvm_config.add_tool_substitutions(tools, tool_dirs)
-config.substitutions.append(('%stdinclude', '-I ' + config.clang_header_dir + " -I " + config.test_source_root + "/polybench/utilities"))
+
+import subprocess
+
+resource_dir = subprocess.check_output([config.llvm_tools_dir + "/clang", "-print-resource-dir"]).decode('utf-8').strip()
+
+config.substitutions.append(('%stdinclude', '-resource-dir=' + resource_dir + " -I " + config.test_source_root + "/polybench/utilities"))
+config.substitutions.append(('%resourcedir', '-resource-dir=' + resource_dir))
 config.substitutions.append(('%polyexec', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_TIME -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET'))
 config.substitutions.append(('%polyverify', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_DUMP_ARRAYS -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET'))

tools/mlir-clang/Test/lit.site.cfg.in (-1)
@@ -6,7 +6,6 @@ config.llvm_tools_dir = path(r"@LLVM_TOOLS_DIR@")
 config.polygeist_tools_dir = path(r"@POLYGEIST_TOOLS_DIR@")
 config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.mlir_clang_obj_root = "@MLIR_CLANG_BINARY_DIR@"
-config.clang_header_dir = "@CLANG_HEADER_DIR@"
 config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_obj_root = path(r"@LLVM_BINARY_DIR@")

tools/mlir-clang/mlir-clang.cc (+6)
@@ -111,6 +111,12 @@ static cl::opt<std::string> CUDAGPUArch("cuda-gpu-arch", cl::init(""),
 static cl::opt<std::string> CUDAPath("cuda-path", cl::init(""),
                                      cl::desc("CUDA Path"));
 
+static cl::opt<bool> NoCUDAInc("nocudainc", cl::init(false),
+                               cl::desc("Do not include CUDA headers"));
+
+static cl::opt<bool> NoCUDALib("nocudalib", cl::init(false),
+                               cl::desc("Do not link CUDA libdevice"));
+
 static cl::opt<std::string> Output("o", cl::init("-"), cl::desc("Output file"));
 
 static cl::opt<std::string> cfunction("function",
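
A hedged usage sketch of the new flags (the input file name is hypothetical; the options mirror the RUN line of the new test): they let mlir-clang process CUDA sources on machines without a CUDA installation by forwarding -nocudainc/-nocudalib to the clang driver:

    mlir-clang kernel.cu --cuda-gpu-arch=sm_60 -nocudainc -nocudalib \
        -resource-dir="$(clang -print-resource-dir)" --function='*' -S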
