2121#include " mlir/Dialect/LLVMIR/LLVMDialect.h"
2222#include " mlir/Interfaces/DataLayoutInterfaces.h"
2323#include " mlir/Pass/PassManager.h"
24+ #include " pti/pti_view.h"
25+
26+ #ifdef GC_ENABLE_GPU_PROFILE
27+ #include " pti_gpu_utils.h"
// Correlation ids collected from PTI external-correlation records, grouped by
// the (external kind, external id) pair each record carried.
28+ std::map<std::pair<pti_view_external_kind, uint64_t >, std::vector<uint32_t >>
29+ external_corr_map;
// Kernel name reported by each PTI GPU kernel record, keyed by the record's
// correlation id.
30+ std::map<uint32_t , std::string> runtime_enq_2_gpu_kernel_name_map;
// Operation name reported by each PTI memory copy / memory fill record, keyed
// by the record's correlation id.
31+ std::map<uint32_t , std::string> runtime_enq_2_gpu_mem_op_name_map;
32+
33+ class GPUKernelTracer {
34+ public:
35+ GPUKernelTracer () {
36+ gcLogD (" Enable Profiling." );
37+ ptiViewSetCallbacks (
38+ [](auto **buf, auto *buf_size) {
39+ *buf_size = sizeof (pti_view_record_kernel) * 100 ;
40+ void *ptr = ::operator new (*buf_size);
41+ ptr = std::align (8 , sizeof (unsigned char ), ptr, *buf_size);
42+ *buf = reinterpret_cast <unsigned char *>(ptr);
43+ if (!*buf) {
44+ std::abort ();
45+ }
46+ return ;
47+ },
48+ [](auto *buf, auto buf_size, auto valid_buf_size) {
49+ if (!buf_size || !valid_buf_size || !buf_size) {
50+ std::cerr << " Received empty buffer" << ' \n ' ;
51+ if (valid_buf_size) {
52+ ::operator delete (buf);
53+ }
54+ return ;
55+ }
56+ pti_view_record_base *ptr = nullptr ;
57+ while (true ) {
58+ auto buf_status = ptiViewGetNextRecord (buf, valid_buf_size, &ptr);
59+ if (buf_status == pti_result::PTI_STATUS_END_OF_BUFFER) {
60+ std::cout << " Reached End of buffer" << ' \n ' ;
61+ break ;
62+ }
63+ if (buf_status != pti_result::PTI_SUCCESS) {
64+ std::cerr << " Found Error Parsing Records from PTI" << ' \n ' ;
65+ break ;
66+ }
67+ switch (ptr->_view_kind ) {
68+ case pti_view_kind::PTI_VIEW_INVALID: {
69+ std::cout << " Found Invalid Record" << ' \n ' ;
70+ break ;
71+ }
72+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_COPY: {
73+ std::cout << " ---------------------------------------------------"
74+ " -----------------------------"
75+ << ' \n ' ;
76+ pti_view_record_memory_copy *rec =
77+ reinterpret_cast <pti_view_record_memory_copy *>(ptr);
78+ runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
79+ rec->_name ;
80+ std::cout << " Found Memory Record" << ' \n ' ;
81+ samples_utils::dump_record (rec);
82+ std::cout << " ---------------------------------------------------"
83+ " -----------------------------"
84+ << ' \n ' ;
85+ break ;
86+ }
87+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_MEM_FILL: {
88+ std::cout << " ---------------------------------------------------"
89+ " -----------------------------"
90+ << ' \n ' ;
91+ pti_view_record_memory_fill *rec =
92+ reinterpret_cast <pti_view_record_memory_fill *>(ptr);
93+ runtime_enq_2_gpu_mem_op_name_map[rec->_correlation_id ] =
94+ rec->_name ;
95+ std::cout << " Found Memory Record" << ' \n ' ;
96+ samples_utils::dump_record (rec);
97+ std::cout << " ---------------------------------------------------"
98+ " -----------------------------"
99+ << ' \n ' ;
100+ break ;
101+ }
102+ case pti_view_kind::PTI_VIEW_DEVICE_GPU_KERNEL: {
103+ std::cout << " ---------------------------------------------------"
104+ " -----------------------------"
105+ << ' \n ' ;
106+ pti_view_record_kernel *rec =
107+ reinterpret_cast <pti_view_record_kernel *>(ptr);
108+ runtime_enq_2_gpu_kernel_name_map[rec->_correlation_id ] =
109+ rec->_name ;
110+ std::cout << " Found Kernel Record" << ' \n ' ;
111+ samples_utils::dump_record (rec);
112+
113+ std::cout << " ---------------------------------------------------"
114+ " -----------------------------"
115+ << ' \n ' ;
116+ if (samples_utils::isMonotonic (
117+ {rec->_sycl_task_begin_timestamp ,
118+ rec->_sycl_enqk_begin_timestamp , rec->_append_timestamp ,
119+ rec->_submit_timestamp , rec->_start_timestamp ,
120+ rec->_end_timestamp })) {
121+ std::cout << " ------------> All Monotonic" << std::endl;
122+ } else {
123+ std::cerr
124+ << " ------------> Something wrong: NOT All monotonic"
125+ << std::endl;
126+ };
127+ if (rec->_sycl_task_begin_timestamp == 0 ) {
128+ std::cerr << " ------------> Something wrong: Sycl Task "
129+ " Begin Time is 0"
130+ << std::endl;
131+ }
132+ if (rec->_sycl_enqk_begin_timestamp == 0 ) {
133+ std::cerr << " ------------> Something wrong: Sycl Enq "
134+ " Launch Kernel Time is 0"
135+ << std::endl;
136+ }
137+
138+ break ;
139+ }
140+ case pti_view_kind::PTI_VIEW_EXTERNAL_CORRELATION: {
141+ std::cout << " ---------------------------------------------------"
142+ " -----------------------------"
143+ << ' \n ' ;
144+ pti_view_record_external_correlation *rec =
145+ reinterpret_cast <pti_view_record_external_correlation *>(ptr);
146+
147+ external_corr_map[std::pair{rec->_external_kind ,
148+ rec->_external_id }]
149+ .push_back (rec->_correlation_id );
150+ samples_utils::dump_record (rec);
151+ break ;
152+ }
153+ case pti_view_kind::PTI_VIEW_OPENCL_CALLS: {
154+ std::cout << " ---------------------------------------------------"
155+ " -----------------------------"
156+ << ' \n ' ;
157+ pti_view_record_oclcalls *rec =
158+ reinterpret_cast <pti_view_record_oclcalls *>(ptr);
159+ samples_utils::dump_record (rec);
160+ break ;
161+ }
162+ default : {
163+ std::cerr << " This shouldn't happen" << ' \n ' ;
164+ break ;
165+ }
166+ }
167+ }
168+ ::operator delete (buf);
169+ });
170+ ptiViewSetOclProfiling ();
171+
172+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_KERNEL);
173+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
174+ ptiViewEnable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
175+ ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
176+ ptiViewEnable (PTI_VIEW_EXTERNAL_CORRELATION);
177+ }
178+
179+ ~GPUKernelTracer () {
180+ gcLogD (" Profiling is finished." );
181+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_KERNEL);
182+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_COPY);
183+ ptiViewDisable (PTI_VIEW_DEVICE_GPU_MEM_FILL);
184+ ptiViewEnable (PTI_VIEW_OPENCL_CALLS);
185+ ptiViewDisable (PTI_VIEW_EXTERNAL_CORRELATION);
186+ ptiFlushAllViews ();
187+ }
188+ };
189+
190+ /*
A single tracer object with static storage duration, so that device activity is
traced for the whole run of the program. Its constructor registers the PTI
buffer callbacks and enables the traced views; its destructor is where the
views are switched off and ptiFlushAllViews() is called, which lets the
buffer-completed callback print the records collected during the tracer's
lifetime. For a comparable EnableProfiling / DisableProfiling implementation,
see https://github.com/intel/pti-gpu/blob/master/tools/onetrace/tool.cc.
*/
200+ static GPUKernelTracer tracer;
201+
202+ #endif
24203
25204namespace mlir ::gc::gpu {
26205
@@ -128,10 +307,9 @@ struct Kernel {
128307
129308 explicit Kernel (cl_program program, cl_kernel kernel, const size_t *gridSize,
130309 const size_t *blockSize, size_t argNum, const size_t *argSize)
131- : program(program),
132- kernel(kernel), globalSize{gridSize[0 ] * blockSize[0 ],
133- gridSize[1 ] * blockSize[1 ],
134- gridSize[2 ] * blockSize[2 ]},
310+ : program(program), kernel(kernel),
311+ globalSize{gridSize[0 ] * blockSize[0 ], gridSize[1 ] * blockSize[1 ],
312+ gridSize[2 ] * blockSize[2 ]},
135313 localSize{blockSize[0 ], blockSize[1 ], blockSize[2 ]},
136314 argSize (argSize, argSize + argNum) {
137315#ifndef NDEBUG
@@ -1014,4 +1192,4 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
10141192 return cache.emplace (OclDevCtxPair (ext.device , ext.context ), ptr)
10151193 .first ->second ;
10161194}
1017- } // namespace mlir::gc::gpu
1195+ } // namespace mlir::gc::gpu
0 commit comments