22
22
#include " mlir/Interfaces/DataLayoutInterfaces.h"
23
23
#include " mlir/Pass/PassManager.h"
24
24
25
+ #ifdef GC_ENABLE_GPU_PROFILE
26
+ #include " PtiGpuUtils.h"
27
+ #include " pti/pti_view.h"
28
// Maps an (external kind, external id) pair to the correlation ids of the
// external-correlation records seen for it; entries are appended by the
// buffer-completed callback installed in GPUKernelTracer's constructor.
+ std::map<std::pair<pti_view_external_kind, uint64_t >, std::vector<uint32_t >>
29
+ external_corr_map;
30
// Maps the correlation id of a GPU kernel record to the name in that record.
+ std::map<uint32_t , std::string> runtime_enq_2_gpu_kernel_name_map;
31
// Maps the correlation id of a GPU memory copy/fill record to the name in
// that record.
+ std::map<uint32_t , std::string> runtime_enq_2_gpu_mem_op_name_map;
32
+
33
+ class GPUKernelTracer {
34
+ public:
35
+ GPUKernelTracer () {
36
+ gcLogD (" Enable Profiling." );
37
+ ptiViewSetCallbacks (
38
+ [](auto **buf, auto *buf_size) {
39
+ *buf_size = sizeof (pti_view_record_kernel) * 100 ;
40
+ void *ptr = ::operator new (*buf_size);
41
+ ptr = std::align (8 , sizeof (unsigned char ), ptr, *buf_size);
42
+ *buf = reinterpret_cast <unsigned char *>(ptr);
43
+ if (!*buf) {
44
+ std::abort ();
45
+ }
46
+ return ;
47
+ },
48
+ [](auto *buf, auto buf_size, auto valid_buf_size) {
49
+ if (!buf_size || !valid_buf_size || !buf_size) {
50
+ std::cerr << " Received empty buffer" << ' \n ' ;
51
+ if (valid_buf_size) {
52
+ ::operator delete (buf);
53
+ }
54
+ return ;
55
+ }
56
+ pti_view_record_base *ptr = nullptr ;
57
+ while (true ) {
58
+ auto buf_status = ptiViewGetNextRecord (buf, valid_buf_size, &ptr);
59
+ if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
60
+ std::cout << " Reached End of buffer" << ' \n ' ;
61
+ break ;
62
+ }
63
+ if (buf_status != pti_result::PTI_SUCCESS) {
64
+ std::cerr << " Found Error Parsing Records from PTI" << ' \n ' ;
65
+ break ;
66
+ }
67
+ switch (ptr->_view_kind ) {
68
+ case pti_view_kind::PTI_VIEW_INVALID: {
69
+ std::cout << " Found Invalid Record" << ' \n ' ;
70
+ break ;
71
+ }
72
+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
73
+ std::cout << " ---------------------------------------------------"
74
+ " -----------------------------"
75
+ << ' \n ' ;
76
+ pti_view_record_memory_copy *rec =
77
+ reinterpret_cast <pti_view_record_memory_copy *>(ptr);
78
+ runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
79
+ rec->_name ;
80
+ std::cout << " Found Memory Record" << ' \n ' ;
81
+ samples_utils::dump_record (rec);
82
+ std::cout << " ---------------------------------------------------"
83
+ " -----------------------------"
84
+ << ' \n ' ;
85
+ break ;
86
+ }
87
+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
88
+ std::cout << " ---------------------------------------------------"
89
+ " -----------------------------"
90
+ << ' \n ' ;
91
+ pti_view_record_memory_fill *rec =
92
+ reinterpret_cast <pti_view_record_memory_fill *>(ptr);
93
+ runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
94
+ rec->_name ;
95
+ std::cout << " Found Memory Record" << ' \n ' ;
96
+ samples_utils::dump_record (rec);
97
+ std::cout << " ---------------------------------------------------"
98
+ " -----------------------------"
99
+ << ' \n ' ;
100
+ break ;
101
+ }
102
+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
103
+ std::cout << " ---------------------------------------------------"
104
+ " -----------------------------"
105
+ << ' \n ' ;
106
+ pti_view_record_kernel *rec =
107
+ reinterpret_cast <pti_view_record_kernel *>(ptr);
108
+ runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id ] =
109
+ rec->_name ;
110
+ std::cout << " Found Kernel Record" << ' \n ' ;
111
+ samples_utils::dump_record (rec);
112
+
113
+ std::cout << " ---------------------------------------------------"
114
+ " -----------------------------"
115
+ << ' \n ' ;
116
+ if (samples_utils::isMonotonic (
117
+ {rec->_sycl_task_begin_timestamp ,
118
+ rec->_sycl_enqk_begin_timestamp , rec->_append_timestamp ,
119
+ rec->_submit_timestamp , rec->_start_timestamp ,
120
+ rec->_end_timestamp })) {
121
+ std::cout << " ------------> All Monotonic" << std::endl;
122
+ } else {
123
+ std::cerr
124
+ << " ------------> Something wrong: NOT All monotonic"
125
+ << std::endl;
126
+ };
127
+ if (rec->_sycl_task_begin_timestamp == 0 ) {
128
+ std::cerr << " ------------> Something wrong: Sycl Task "
129
+ " Begin Time is 0"
130
+ << std::endl;
131
+ }
132
+ if (rec->_sycl_enqk_begin_timestamp == 0 ) {
133
+ std::cerr << " ------------> Something wrong: Sycl Enq "
134
+ " Launch Kernel Time is 0"
135
+ << std::endl;
136
+ }
137
+
138
+ break ;
139
+ }
140
+ case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
141
+ std::cout << " ---------------------------------------------------"
142
+ " -----------------------------"
143
+ << ' \n ' ;
144
+ pti_view_record_external_correlation *rec =
145
+ reinterpret_cast <pti_view_record_external_correlation *>(ptr);
146
+
147
+ external_corr_map[std::pair{rec->_external_kind ,
148
+ rec->_external_id }]
149
+ .push_back (rec->_correlation_id );
150
+ samples_utils::dump_record (rec);
151
+ break ;
152
+ }
153
+ case pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
154
+ std::cout << " ---------------------------------------------------"
155
+ " -----------------------------"
156
+ << ' \n ' ;
157
+ pti_view_record_oclcalls *rec =
158
+ reinterpret_cast <pti_view_record_oclcalls *>(ptr);
159
+ samples_utils::dump_record (rec);
160
+ break ;
161
+ }
162
+ default : {
163
+ std::cerr << " This shouldn't happen" << ' \n ' ;
164
+ break ;
165
+ }
166
+ }
167
+ }
168
+ ::operator delete (buf);
169
+ });
170
+ ptiViewSetOclProfiling ();
171
+
172
+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_KERNEL);
173
+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
174
+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
175
+ ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
176
+ ptiViewEnable (PTI_VIEW_EXTERNAL_CORRELATION);
177
+ }
178
+
179
+ ~GPUKernelTracer () {
180
+ gcLogD (" Profiling is finished." );
181
+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_KERNEL);
182
+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
183
+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
184
+ ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
185
+ ptiViewDisable (PTI_VIEW_EXTERNAL_CORRELATION);
186
+ ptiFlushAllViews ();
187
+ }
188
+ };
189
+
190
+ /*
191
+ Create an RAII tracer with a static lifetime to trace every device kernel
192
+ execution during the program. When the tracer's constructor is called,
193
+ EnableProfiling is also called, registering metric-collection callback
194
+ functions on the OpenCL function calls. When the tracer is destroyed,
195
+ DisableProfiling is also called, which aggregates the metrics collected
196
+ during the tracer's lifetime and prints the result. For a concrete implementation
197
+ of EnableProfiling and DisableProfiling, refer to
198
+ https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc.
199
+ */
200
+ static GPUKernelTracer tracer;
201
+
202
+ #endif
203
+
25
204
namespace mlir ::gc::gpu {
26
205
27
206
#define makeClErrPref (code ) " OpenCL error " , code, " : "
@@ -128,10 +307,9 @@ struct Kernel {
128
307
129
308
explicit Kernel (cl_program program, cl_kernel kernel, const size_t *gridSize,
130
309
const size_t *blockSize, size_t argNum, const size_t *argSize)
131
- : program(program),
132
- kernel(kernel), globalSize{gridSize[0 ] * blockSize[0 ],
133
- gridSize[1 ] * blockSize[1 ],
134
- gridSize[2 ] * blockSize[2 ]},
310
+ : program(program), kernel(kernel),
311
+ globalSize{gridSize[0 ] * blockSize[0 ], gridSize[1 ] * blockSize[1 ],
312
+ gridSize[2 ] * blockSize[2 ]},
135
313
localSize{blockSize[0 ], blockSize[1 ], blockSize[2 ]},
136
314
argSize (argSize, argSize + argNum) {
137
315
#ifndef NDEBUG
@@ -1014,4 +1192,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
1014
1192
return cache.emplace (OclDevCtxPair (ext.device , ext.context ), ptr)
1015
1193
.first ->second ;
1016
1194
}
1017
- } // namespace mlir::gc::gpu
1195
+ } // namespace mlir::gc::gpu
0 commit comments