Skip to content

Commit 4b9bb13

Browse files
committed
add tracing support for gpu
1 parent 238706b commit 4b9bb13

File tree

6 files changed

+521
-5
lines changed

6 files changed

+521
-5
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Complier Python Binding" ON)
5050
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
5151
option(GC_ENABLE_RUNTIME_NAIVE_BRGEMM "Use naive BRGEMM as runtime backend for debug purpose." OFF)
5252
option(GC_BENCH_ENABLE "Build benchgc." ON)
53+
option(GC_ENABLE_GPU_PROFILE "Enable the GPU kernel profiling." OFF)
5354

5455
if(GC_ENABLE_LEGACY)
5556
add_subdirectory(legacy/core)

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,4 +77,5 @@ Graph Compiler supports the following build-time options.
7777
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic link LLVM/MLIR libraries, mainly for developer |
7878
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
7979
| GC_ENABLE_IMEX | ON, **OFF** | Whether to enable the GPU components |
80+
| GC_ENABLE_GPU_PROFILE | ON, **OFF** | Whether to enable GPU profiling, which measures kernel execution time |
8081

cmake/ptigpu.cmake

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
include_guard()
2+
3+
FetchContent_Declare(
4+
PTIGPU
5+
GIT_REPOSITORY https://github.com/intel/pti-gpu.git
6+
GIT_TAG exp_opencl_0.11.0
7+
SOURCE_SUBDIR sdk
8+
)
9+
FetchContent_MakeAvailable(PTIGPU)

lib/gc/ExecutionEngine/GPURuntime/ocl/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,10 @@ gc_add_mlir_library(GcGpuOclRuntime
88
)
99
target_include_directories(GcGpuOclRuntime PUBLIC ${OpenCL_INCLUDE_DIRS})
1010
target_link_libraries(GcGpuOclRuntime PUBLIC ${OpenCL_LIBRARIES})
11+
12+
if(GC_ENABLE_GPU_PROFILE)
13+
include(ptigpu)
14+
find_package(Pti REQUIRED)
15+
target_link_libraries(GcGpuOclRuntime PRIVATE Pti::pti_view)
16+
add_definitions(-DGC_ENABLE_GPU_PROFILE)
17+
endif ()

lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp

Lines changed: 183 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,185 @@
2222
#include "mlir/Interfaces/DataLayoutInterfaces.h"
2323
#include "mlir/Pass/PassManager.h"
2424

25+
#ifdef GC_ENABLE_GPU_PROFILE
26+
#include "PtiGpuUtils.h"
27+
#include "pti/pti_view.h"
28+
std::map<std::pair<pti_view_external_kind, uint64_t>, std::vector<uint32_t>>
29+
external_corr_map;
30+
std::map<uint32_t, std::string> runtime_enq_2_gpu_kernel_name_map;
31+
std::map<uint32_t, std::string> runtime_enq_2_gpu_mem_op_name_map;
32+
33+
class GPUKernelTracer {
34+
public:
35+
GPUKernelTracer() {
36+
gcLogD("Enable Profiling.");
37+
ptiViewSetCallbacks(
38+
[](auto **buf, auto *buf_size) {
39+
*buf_size = sizeof(pti_view_record_kernel) * 100;
40+
void *ptr = ::operator new(*buf_size);
41+
ptr = std::align(8, sizeof(unsigned char), ptr, *buf_size);
42+
*buf = reinterpret_cast<unsigned char *>(ptr);
43+
if (!*buf) {
44+
std::abort();
45+
}
46+
return;
47+
},
48+
[](auto *buf, auto buf_size, auto valid_buf_size) {
49+
if (!buf_size || !valid_buf_size || !buf_size) {
50+
std::cerr << "Received empty buffer" << '\n';
51+
if (valid_buf_size) {
52+
::operator delete(buf);
53+
}
54+
return;
55+
}
56+
pti_view_record_base *ptr = nullptr;
57+
while (true) {
58+
auto buf_status = ptiViewGetNextRecord(buf, valid_buf_size, &ptr);
59+
if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
60+
std::cout << "Reached End of buffer" << '\n';
61+
break;
62+
}
63+
if (buf_status != pti_result::PTI_SUCCESS) {
64+
std::cerr << "Found Error Parsing Records from PTI" << '\n';
65+
break;
66+
}
67+
switch (ptr->_view_kind) {
68+
case pti_view_kind::PTI_VIEW_INVALID: {
69+
std::cout << "Found Invalid Record" << '\n';
70+
break;
71+
}
72+
case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
73+
std::cout << "---------------------------------------------------"
74+
"-----------------------------"
75+
<< '\n';
76+
pti_view_record_memory_copy *rec =
77+
reinterpret_cast<pti_view_record_memory_copy *>(ptr);
78+
runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
79+
rec->_name;
80+
std::cout << "Found Memory Record" << '\n';
81+
samples_utils::dump_record(rec);
82+
std::cout << "---------------------------------------------------"
83+
"-----------------------------"
84+
<< '\n';
85+
break;
86+
}
87+
case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
88+
std::cout << "---------------------------------------------------"
89+
"-----------------------------"
90+
<< '\n';
91+
pti_view_record_memory_fill *rec =
92+
reinterpret_cast<pti_view_record_memory_fill *>(ptr);
93+
runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id] =
94+
rec->_name;
95+
std::cout << "Found Memory Record" << '\n';
96+
samples_utils::dump_record(rec);
97+
std::cout << "---------------------------------------------------"
98+
"-----------------------------"
99+
<< '\n';
100+
break;
101+
}
102+
case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
103+
std::cout << "---------------------------------------------------"
104+
"-----------------------------"
105+
<< '\n';
106+
pti_view_record_kernel *rec =
107+
reinterpret_cast<pti_view_record_kernel *>(ptr);
108+
runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id] =
109+
rec->_name;
110+
std::cout << "Found Kernel Record" << '\n';
111+
samples_utils::dump_record(rec);
112+
113+
std::cout << "---------------------------------------------------"
114+
"-----------------------------"
115+
<< '\n';
116+
if (samples_utils::isMonotonic(
117+
{rec->_sycl_task_begin_timestamp,
118+
rec->_sycl_enqk_begin_timestamp, rec->_append_timestamp,
119+
rec->_submit_timestamp, rec->_start_timestamp,
120+
rec->_end_timestamp})) {
121+
std::cout << "------------> All Monotonic" << std::endl;
122+
} else {
123+
std::cerr
124+
<< "------------> Something wrong: NOT All monotonic"
125+
<< std::endl;
126+
};
127+
if (rec->_sycl_task_begin_timestamp == 0) {
128+
std::cerr << "------------> Something wrong: Sycl Task "
129+
"Begin Time is 0"
130+
<< std::endl;
131+
}
132+
if (rec->_sycl_enqk_begin_timestamp == 0) {
133+
std::cerr << "------------> Something wrong: Sycl Enq "
134+
"Launch Kernel Time is 0"
135+
<< std::endl;
136+
}
137+
138+
break;
139+
}
140+
case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
141+
std::cout << "---------------------------------------------------"
142+
"-----------------------------"
143+
<< '\n';
144+
pti_view_record_external_correlation *rec =
145+
reinterpret_cast<pti_view_record_external_correlation *>(ptr);
146+
147+
external_corr_map[std::pair{rec->_external_kind,
148+
rec->_external_id}]
149+
.push_back(rec->_correlation_id);
150+
samples_utils::dump_record(rec);
151+
break;
152+
}
153+
case pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
154+
std::cout << "---------------------------------------------------"
155+
"-----------------------------"
156+
<< '\n';
157+
pti_view_record_oclcalls *rec =
158+
reinterpret_cast<pti_view_record_oclcalls *>(ptr);
159+
samples_utils::dump_record(rec);
160+
break;
161+
}
162+
default: {
163+
std::cerr << "This shouldn't happen" << '\n';
164+
break;
165+
}
166+
}
167+
}
168+
::operator delete(buf);
169+
});
170+
ptiViewSetOclProfiling();
171+
172+
ptiViewEnable(PTI_VIEW_DEVICE_GPU_KERNEL);
173+
ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
174+
ptiViewEnable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
175+
ptiViewEnable(PTI_VIEW_OPENCL_CALLS);
176+
ptiViewEnable(PTI_VIEW_EXTERNAL_CORRELATION);
177+
}
178+
179+
~GPUKernelTracer() {
180+
gcLogD("Profiling is finished.");
181+
ptiViewDisable(PTI_VIEW_DEVICE_GPU_KERNEL);
182+
ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_COPY);
183+
ptiViewDisable(PTI_VIEW_DEVICE_GPU_MEM_FILL);
184+
ptiViewEnable(PTI_VIEW_OPENCL_CALLS);
185+
ptiViewDisable(PTI_VIEW_EXTERNAL_CORRELATION);
186+
ptiFlushAllViews();
187+
}
188+
};
189+
190+
/*
191+
Create an RAII tracer with a static life cycle to trace all device kernel
192+
execution during the program. When the tracer's constructor is called, the
193+
EnableProfiling will also be called, registering some metric collection
194+
call-back function into the opencl function call. When the tracer is destroyed,
195+
the DisableProfiling is also called, which will aggregate the metrics collected
196+
during the tracer lifetime and print the result. The concrete implementation of
197+
EnableProfiling and DisableProfiling could refer to
198+
https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc.
199+
*/
200+
static GPUKernelTracer tracer;
201+
202+
#endif
203+
25204
namespace mlir::gc::gpu {
26205

27206
#define makeClErrPref(code) "OpenCL error ", code, ": "
@@ -128,10 +307,9 @@ struct Kernel {
128307

129308
explicit Kernel(cl_program program, cl_kernel kernel, const size_t *gridSize,
130309
const size_t *blockSize, size_t argNum, const size_t *argSize)
131-
: program(program),
132-
kernel(kernel), globalSize{gridSize[0] * blockSize[0],
133-
gridSize[1] * blockSize[1],
134-
gridSize[2] * blockSize[2]},
310+
: program(program), kernel(kernel),
311+
globalSize{gridSize[0] * blockSize[0], gridSize[1] * blockSize[1],
312+
gridSize[2] * blockSize[2]},
135313
localSize{blockSize[0], blockSize[1], blockSize[2]},
136314
argSize(argSize, argSize + argNum) {
137315
#ifndef NDEBUG
@@ -1014,4 +1192,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
10141192
return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
10151193
.first->second;
10161194
}
1017-
} // namespace mlir::gc::gpu
1195+
} // namespace mlir::gc::gpu

0 commit comments

Comments
 (0)